blob: 908fc8292a0fc7319881086418ae64a45a2bb699 [file] [log] [blame]
/*
2 * Copyright (c) 2019 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "arm.hpp"
26#include "input.hpp"
27
28namespace winograd
29{
30
31#ifdef __aarch64__
32
33template <>
34void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
35 int n_channels,
36 const float* input_base,
37 const int input_row_stride,
38 const int input_col_stride,
39 float* matrix_base,
40 const int matrix_stride
41)
42{
43 const float pcoeffs[4] = {1.0f, 2.0f, 4.0f, 5.0f};
44 __asm__ __volatile__(
45 "ldr q0, [%[pcoeffs]]\n"
46 "add x25, %[inptr0], %[input_row_stride]\n"
47 "add x18, %[input_col_stride1], %[input_col_stride1]\n"
48 "add x16, x25, %[input_row_stride]\n"
49 "add x19, x18, %[input_col_stride1]\n"
50 "add x26, x16, %[input_row_stride]\n"
51 "add x20, x19, %[input_col_stride1]\n"
52 "add x17, x26, %[input_row_stride]\n"
53 "add x21, x20, %[input_col_stride1]\n"
54 "add x27, x17, %[input_row_stride]\n"
55 "add x28, %[outptr0], %[output_row_stride]\n"
56 "add x11, %[output_col_stride1], %[output_col_stride1]\n"
57 "add x22, x28, %[output_row_stride]\n"
58 "add x13, x11, %[output_col_stride1]\n"
59 "add x12, x22, %[output_row_stride]\n"
60 "add x23, x13, %[output_col_stride1]\n"
61 "add x14, x12, %[output_row_stride]\n"
62 "add x15, x23, %[output_col_stride1]\n"
63 "add x24, x14, %[output_row_stride]\n"
64 "cmp %w[n_channels], #4\n"
65 "blt 2f\n"
66 "1:\n"
67 "ldr q8, [%[inptr0], x20]\n"
68 "ldr q2, [%[inptr0], x18]\n"
69 "mov v14.16b, v8.16b\n"
70 "ldr q9, [%[inptr0]]\n"
71 "mov v10.16b, v8.16b\n"
72 "ldr q1, [%[inptr0], x21]\n"
73 "fmla v14.4s, v9.4s, v0.s[2]\n"
74 "ldr q4, [%[inptr0], x19]\n"
75 "mov v9.16b, v8.16b\n"
76 "ldr q12, [%[inptr0], %[input_col_stride1]]\n"
77 "fmls v10.4s, v12.4s, v0.s[2]\n"
78 "ldr q5, [x16, x20]\n"
79 "fmls v14.4s, v2.4s, v0.s[3]\n"
80 "ldr q20, [x16, x18]\n"
81 "fmla v9.4s, v12.4s, v0.s[2]\n"
82 "ldr q3, [x16]\n"
83 "fmls v10.4s, v2.4s, v0.s[2]\n"
84 "ldr q6, [x16, x21]\n"
85 "mov v7.16b, v8.16b\n"
86 "ldr q16, [x16, x19]\n"
87 "fmls v9.4s, v2.4s, v0.s[2]\n"
88 "ldr q22, [x16, %[input_col_stride1]]\n"
89 "fadd v10.4s, v10.4s, v4.4s\n"
90 "ldr q17, [x17, x20]\n"
91 "fmls v7.4s, v12.4s, v0.s[1]\n"
92 "ldr q15, [x17, x18]\n"
93 "fsub v9.4s, v9.4s, v4.4s\n"
94 "ldr q19, [x17]\n"
95 "mov v8.16b, v8.16b\n"
96 "ldr q18, [x17, x21]\n"
97 "fsub v7.4s, v7.4s, v2.4s\n"
98 "ldr q13, [x17, x19]\n"
99 "fmla v7.4s, v4.4s, v0.s[1]\n"
100 "ldr q21, [x17, %[input_col_stride1]]\n"
101 "fmla v8.4s, v12.4s, v0.s[1]\n"
102 "add %[inptr0], %[inptr0], #16\n"
103 "mov v11.16b, v1.16b\n"
104 "add x16, x16, #16\n"
105 "mov v1.16b, v5.16b\n"
106 "add x17, x17, #16\n"
107 "fsub v8.4s, v8.4s, v2.4s\n"
108 "fmla v11.4s, v12.4s, v0.s[2]\n"
109 "fmls v8.4s, v4.4s, v0.s[1]\n"
110 "fmla v1.4s, v3.4s, v0.s[2]\n"
111 "mov v2.16b, v5.16b\n"
112 "mov v3.16b, v5.16b\n"
113 "fmls v11.4s, v4.4s, v0.s[3]\n"
114 "mov v4.16b, v5.16b\n"
115 "fmls v1.4s, v20.4s, v0.s[3]\n"
116 "fmls v2.4s, v22.4s, v0.s[2]\n"
117 "fmla v3.4s, v22.4s, v0.s[2]\n"
118 "fmls v4.4s, v22.4s, v0.s[1]\n"
119 "mov v5.16b, v5.16b\n"
120 "mov v6.16b, v6.16b\n"
121 "fmls v2.4s, v20.4s, v0.s[2]\n"
122 "mov v12.16b, v17.16b\n"
123 "fmls v3.4s, v20.4s, v0.s[2]\n"
124 "fsub v4.4s, v4.4s, v20.4s\n"
125 "fmla v4.4s, v16.4s, v0.s[1]\n"
126 "fmla v5.4s, v22.4s, v0.s[1]\n"
127 "fadd v2.4s, v2.4s, v16.4s\n"
128 "fmla v6.4s, v22.4s, v0.s[2]\n"
129 "fsub v3.4s, v3.4s, v16.4s\n"
130 "fmla v12.4s, v19.4s, v0.s[2]\n"
131 "fsub v5.4s, v5.4s, v20.4s\n"
132 "mov v19.16b, v17.16b\n"
133 "fmls v5.4s, v16.4s, v0.s[1]\n"
134 "fmls v6.4s, v16.4s, v0.s[3]\n"
135 "fmls v12.4s, v15.4s, v0.s[3]\n"
136 "fmls v19.4s, v21.4s, v0.s[2]\n"
137 "mov v20.16b, v17.16b\n"
138 "mov v16.16b, v17.16b\n"
139 "mov v17.16b, v17.16b\n"
140 "mov v18.16b, v18.16b\n"
141 "fmls v19.4s, v15.4s, v0.s[2]\n"
142 "fmla v20.4s, v21.4s, v0.s[2]\n"
143 "fmls v16.4s, v21.4s, v0.s[1]\n"
144 "fmla v17.4s, v21.4s, v0.s[1]\n"
145 "fmla v18.4s, v21.4s, v0.s[2]\n"
146 "mov v23.16b, v12.16b\n"
147 "fadd v19.4s, v19.4s, v13.4s\n"
148 "fmls v20.4s, v15.4s, v0.s[2]\n"
149 "fsub v16.4s, v16.4s, v15.4s\n"
150 "fsub v17.4s, v17.4s, v15.4s\n"
151 "fmla v16.4s, v13.4s, v0.s[1]\n"
152 "fmls v17.4s, v13.4s, v0.s[1]\n"
153 "fsub v20.4s, v20.4s, v13.4s\n"
154 "fmls v18.4s, v13.4s, v0.s[3]\n"
155 "fmla v23.4s, v14.4s, v0.s[2]\n"
156 "mov v15.16b, v19.16b\n"
157 "mov v14.16b, v20.16b\n"
158 "mov v24.16b, v16.16b\n"
159 "fmla v15.4s, v10.4s, v0.s[2]\n"
160 "mov v10.16b, v17.16b\n"
161 "fmls v23.4s, v1.4s, v0.s[3]\n"
162 "fmla v14.4s, v9.4s, v0.s[2]\n"
163 "fmla v24.4s, v7.4s, v0.s[2]\n"
164 "fmla v10.4s, v8.4s, v0.s[2]\n"
165 "fmls v15.4s, v2.4s, v0.s[3]\n"
166 "mov v7.16b, v18.16b\n"
167 "str q23, [%[outptr0]]\n"
168 "fmls v14.4s, v3.4s, v0.s[3]\n"
169 "fmls v24.4s, v4.4s, v0.s[3]\n"
170 "fmls v10.4s, v5.4s, v0.s[3]\n"
171 "str q15, [%[outptr0], %[output_col_stride1]]\n"
172 "fmla v7.4s, v11.4s, v0.s[2]\n"
173 "str q14, [%[outptr0], x11]\n"
174 "str q24, [%[outptr0], x13]\n"
175 "str q10, [%[outptr0], x23]\n"
176 "fmls v7.4s, v6.4s, v0.s[3]\n"
177 "str q7, [%[outptr0], x15]\n"
178 "add %[outptr0], %[outptr0], #16\n"
179 "mov v26.16b, v12.16b\n"
180 "mov v25.16b, v19.16b\n"
181 "ldr q11, [x25, x20]\n"
182 "mov v10.16b, v11.16b\n"
183 "ldr q23, [x25, x18]\n"
184 "mov v9.16b, v11.16b\n"
185 "ldr q7, [x25]\n"
186 "fmla v10.4s, v7.4s, v0.s[2]\n"
187 "ldr q13, [x25, x21]\n"
188 "mov v7.16b, v11.16b\n"
189 "ldr q31, [x25, x19]\n"
190 "mov v8.16b, v11.16b\n"
191 "ldr q21, [x25, %[input_col_stride1]]\n"
192 "fmls v10.4s, v23.4s, v0.s[3]\n"
193 "ldr q30, [x26, x20]\n"
194 "fmls v9.4s, v21.4s, v0.s[2]\n"
195 "ldr q29, [x26, x18]\n"
196 "fmla v7.4s, v21.4s, v0.s[2]\n"
197 "ldr q22, [x26]\n"
198 "fmls v8.4s, v21.4s, v0.s[1]\n"
199 "ldr q24, [x26, x21]\n"
200 "fmls v9.4s, v23.4s, v0.s[2]\n"
201 "ldr q27, [x26, x19]\n"
202 "fmls v7.4s, v23.4s, v0.s[2]\n"
203 "ldr q28, [x26, %[input_col_stride1]]\n"
204 "fsub v8.4s, v8.4s, v23.4s\n"
205 "add x25, x25, #16\n"
206 "fadd v9.4s, v9.4s, v31.4s\n"
207 "add x26, x26, #16\n"
208 "fsub v7.4s, v7.4s, v31.4s\n"
209 "fmla v8.4s, v31.4s, v0.s[1]\n"
210 "mov v11.16b, v11.16b\n"
211 "mov v15.16b, v13.16b\n"
212 "mov v14.16b, v30.16b\n"
213 "mov v13.16b, v30.16b\n"
214 "fmla v11.4s, v21.4s, v0.s[1]\n"
215 "fmla v15.4s, v21.4s, v0.s[2]\n"
216 "fmla v14.4s, v22.4s, v0.s[2]\n"
217 "fmls v13.4s, v28.4s, v0.s[2]\n"
218 "mov v21.16b, v30.16b\n"
219 "mov v22.16b, v30.16b\n"
220 "fsub v11.4s, v11.4s, v23.4s\n"
221 "fmls v15.4s, v31.4s, v0.s[3]\n"
222 "fmls v11.4s, v31.4s, v0.s[1]\n"
223 "fmls v14.4s, v29.4s, v0.s[3]\n"
224 "fmls v13.4s, v29.4s, v0.s[2]\n"
225 "fmla v21.4s, v28.4s, v0.s[2]\n"
226 "fmls v22.4s, v28.4s, v0.s[1]\n"
227 "mov v23.16b, v30.16b\n"
228 "mov v24.16b, v24.16b\n"
229 "fmls v26.4s, v10.4s, v0.s[2]\n"
230 "fadd v13.4s, v13.4s, v27.4s\n"
231 "fmls v21.4s, v29.4s, v0.s[2]\n"
232 "fsub v22.4s, v22.4s, v29.4s\n"
233 "fmla v23.4s, v28.4s, v0.s[1]\n"
234 "fmla v22.4s, v27.4s, v0.s[1]\n"
235 "fmla v24.4s, v28.4s, v0.s[2]\n"
236 "fsub v21.4s, v21.4s, v27.4s\n"
237 "fmls v26.4s, v1.4s, v0.s[2]\n"
238 "fsub v23.4s, v23.4s, v29.4s\n"
239 "fmls v25.4s, v9.4s, v0.s[2]\n"
240 "fmls v23.4s, v27.4s, v0.s[1]\n"
241 "fmls v24.4s, v27.4s, v0.s[3]\n"
242 "fadd v26.4s, v26.4s, v14.4s\n"
243 "mov v27.16b, v20.16b\n"
244 "str q26, [x28]\n"
245 "fmls v25.4s, v2.4s, v0.s[2]\n"
246 "fmls v27.4s, v7.4s, v0.s[2]\n"
247 "mov v31.16b, v16.16b\n"
248 "mov v30.16b, v17.16b\n"
249 "mov v29.16b, v18.16b\n"
250 "fadd v25.4s, v25.4s, v13.4s\n"
251 "fmls v31.4s, v8.4s, v0.s[2]\n"
252 "str q25, [x28, %[output_col_stride1]]\n"
253 "fmls v27.4s, v3.4s, v0.s[2]\n"
254 "fmls v30.4s, v11.4s, v0.s[2]\n"
255 "fmls v29.4s, v15.4s, v0.s[2]\n"
256 "fmls v31.4s, v4.4s, v0.s[2]\n"
257 "mov v26.16b, v12.16b\n"
258 "fadd v27.4s, v27.4s, v21.4s\n"
259 "mov v25.16b, v19.16b\n"
260 "str q27, [x28, x11]\n"
261 "fmls v30.4s, v5.4s, v0.s[2]\n"
262 "fadd v31.4s, v31.4s, v22.4s\n"
263 "fmls v29.4s, v6.4s, v0.s[2]\n"
264 "str q31, [x28, x13]\n"
265 "fmla v26.4s, v10.4s, v0.s[2]\n"
266 "fadd v30.4s, v30.4s, v23.4s\n"
267 "fmla v25.4s, v9.4s, v0.s[2]\n"
268 "str q30, [x28, x23]\n"
269 "fadd v29.4s, v29.4s, v24.4s\n"
270 "str q29, [x28, x15]\n"
271 "fmls v26.4s, v1.4s, v0.s[2]\n"
272 "fmls v25.4s, v2.4s, v0.s[2]\n"
273 "add x28, x28, #16\n"
274 "mov v30.16b, v20.16b\n"
275 "mov v29.16b, v16.16b\n"
276 "fsub v26.4s, v26.4s, v14.4s\n"
277 "mov v28.16b, v17.16b\n"
278 "str q26, [x22]\n"
279 "fsub v25.4s, v25.4s, v13.4s\n"
280 "str q25, [x22, %[output_col_stride1]]\n"
281 "fmla v30.4s, v7.4s, v0.s[2]\n"
282 "fmla v29.4s, v8.4s, v0.s[2]\n"
283 "fmla v28.4s, v11.4s, v0.s[2]\n"
284 "mov v26.16b, v18.16b\n"
285 "mov v25.16b, v12.16b\n"
286 "fmls v30.4s, v3.4s, v0.s[2]\n"
287 "mov v31.16b, v19.16b\n"
288 "fmls v29.4s, v4.4s, v0.s[2]\n"
289 "fmls v28.4s, v5.4s, v0.s[2]\n"
290 "fmla v26.4s, v15.4s, v0.s[2]\n"
291 "fmls v25.4s, v10.4s, v0.s[1]\n"
292 "fsub v30.4s, v30.4s, v21.4s\n"
293 "fmls v31.4s, v9.4s, v0.s[1]\n"
294 "str q30, [x22, x11]\n"
295 "fsub v29.4s, v29.4s, v22.4s\n"
296 "str q29, [x22, x13]\n"
297 "fsub v28.4s, v28.4s, v23.4s\n"
298 "str q28, [x22, x23]\n"
299 "fmls v26.4s, v6.4s, v0.s[2]\n"
300 "fsub v25.4s, v25.4s, v1.4s\n"
301 "fsub v31.4s, v31.4s, v2.4s\n"
302 "fmla v25.4s, v14.4s, v0.s[1]\n"
303 "fmla v31.4s, v13.4s, v0.s[1]\n"
304 "fsub v26.4s, v26.4s, v24.4s\n"
305 "mov v27.16b, v20.16b\n"
306 "str q26, [x22, x15]\n"
307 "mov v26.16b, v16.16b\n"
308 "str q25, [x12]\n"
309 "fmls v27.4s, v7.4s, v0.s[1]\n"
310 "str q31, [x12, %[output_col_stride1]]\n"
311 "fmls v26.4s, v8.4s, v0.s[1]\n"
312 "mov v25.16b, v17.16b\n"
313 "add x22, x22, #16\n"
314 "fsub v27.4s, v27.4s, v3.4s\n"
315 "mov v28.16b, v18.16b\n"
316 "fmla v27.4s, v21.4s, v0.s[1]\n"
317 "fsub v26.4s, v26.4s, v4.4s\n"
318 "fmla v26.4s, v22.4s, v0.s[1]\n"
319 "fmls v25.4s, v11.4s, v0.s[1]\n"
320 "fmls v28.4s, v15.4s, v0.s[1]\n"
321 "mov v12.16b, v12.16b\n"
322 "str q27, [x12, x11]\n"
323 "mov v19.16b, v19.16b\n"
324 "str q26, [x12, x13]\n"
325 "fsub v25.4s, v25.4s, v5.4s\n"
326 "fmla v25.4s, v23.4s, v0.s[1]\n"
327 "fsub v28.4s, v28.4s, v6.4s\n"
328 "fmla v28.4s, v24.4s, v0.s[1]\n"
329 "fmla v12.4s, v10.4s, v0.s[1]\n"
330 "fmla v19.4s, v9.4s, v0.s[1]\n"
331 "mov v20.16b, v20.16b\n"
332 "str q25, [x12, x23]\n"
333 "mov v16.16b, v16.16b\n"
334 "str q28, [x12, x15]\n"
335 "fsub v12.4s, v12.4s, v1.4s\n"
336 "fmls v12.4s, v14.4s, v0.s[1]\n"
337 "add x12, x12, #16\n"
338 "fsub v19.4s, v19.4s, v2.4s\n"
339 "fmla v20.4s, v7.4s, v0.s[1]\n"
340 "fmls v19.4s, v13.4s, v0.s[1]\n"
341 "fmla v16.4s, v8.4s, v0.s[1]\n"
342 "str q12, [x14]\n"
343 "mov v1.16b, v17.16b\n"
344 "fsub v20.4s, v20.4s, v3.4s\n"
345 "mov v17.16b, v18.16b\n"
346 "str q19, [x14, %[output_col_stride1]]\n"
347 "fmls v20.4s, v21.4s, v0.s[1]\n"
348 "fsub v16.4s, v16.4s, v4.4s\n"
349 "fmla v1.4s, v11.4s, v0.s[1]\n"
350 "fmls v16.4s, v22.4s, v0.s[1]\n"
351 "fmla v17.4s, v15.4s, v0.s[1]\n"
352 "str q20, [x14, x11]\n"
353 "fsub v1.4s, v1.4s, v5.4s\n"
354 "str q16, [x14, x13]\n"
355 "fmls v1.4s, v23.4s, v0.s[1]\n"
356 "fsub v17.4s, v17.4s, v6.4s\n"
357 "fmls v17.4s, v24.4s, v0.s[1]\n"
358 "str q1, [x14, x23]\n"
359 "str q17, [x14, x15]\n"
360 "add x14, x14, #16\n"
361 "ldr q2, [x27, x20]\n"
362 "mov v4.16b, v2.16b\n"
363 "ldr q17, [x27, x18]\n"
364 "mov v12.16b, v2.16b\n"
365 "ldr q18, [x27]\n"
366 "fmla v4.4s, v18.4s, v0.s[2]\n"
367 "ldr q3, [x27, x21]\n"
368 "mov v6.16b, v2.16b\n"
369 "ldr q5, [x27, x19]\n"
370 "mov v1.16b, v2.16b\n"
371 "ldr q18, [x27, %[input_col_stride1]]\n"
372 "fmls v4.4s, v17.4s, v0.s[3]\n"
373 "add x27, x27, #16\n"
374 "fmls v12.4s, v18.4s, v0.s[2]\n"
375 "sub %w[n_channels], %w[n_channels], #4\n"
376 "fmla v6.4s, v18.4s, v0.s[2]\n"
377 "cmp %w[n_channels], #4\n"
378 "fmls v1.4s, v18.4s, v0.s[1]\n"
379 "mov v2.16b, v2.16b\n"
380 "fmls v12.4s, v17.4s, v0.s[2]\n"
381 "mov v3.16b, v3.16b\n"
382 "fmls v6.4s, v17.4s, v0.s[2]\n"
383 "fmla v2.4s, v18.4s, v0.s[1]\n"
384 "fsub v1.4s, v1.4s, v17.4s\n"
385 "fmla v3.4s, v18.4s, v0.s[2]\n"
386 "fadd v12.4s, v12.4s, v5.4s\n"
387 "fmla v1.4s, v5.4s, v0.s[1]\n"
388 "fsub v6.4s, v6.4s, v5.4s\n"
389 "fsub v2.4s, v2.4s, v17.4s\n"
390 "fmls v2.4s, v5.4s, v0.s[1]\n"
391 "fmls v3.4s, v5.4s, v0.s[3]\n"
392 "mov v4.16b, v4.16b\n"
393 "mov v16.16b, v12.16b\n"
394 "mov v5.16b, v6.16b\n"
395 "mov v6.16b, v1.16b\n"
396 "fmla v4.4s, v10.4s, v0.s[2]\n"
397 "fmla v16.4s, v9.4s, v0.s[2]\n"
398 "fmla v5.4s, v7.4s, v0.s[2]\n"
399 "fmla v6.4s, v8.4s, v0.s[2]\n"
400 "mov v9.16b, v2.16b\n"
401 "mov v10.16b, v3.16b\n"
402 "fmls v4.4s, v14.4s, v0.s[3]\n"
403 "fmls v16.4s, v13.4s, v0.s[3]\n"
404 "fmls v5.4s, v21.4s, v0.s[3]\n"
405 "fmls v6.4s, v22.4s, v0.s[3]\n"
406 "fmla v9.4s, v11.4s, v0.s[2]\n"
407 "fmla v10.4s, v15.4s, v0.s[2]\n"
408 "str q4, [x24]\n"
409 "str q16, [x24, %[output_col_stride1]]\n"
410 "str q5, [x24, x11]\n"
411 "str q6, [x24, x13]\n"
412 "fmls v9.4s, v23.4s, v0.s[3]\n"
413 "fmls v10.4s, v24.4s, v0.s[3]\n"
414 "str q9, [x24, x23]\n"
415 "str q10, [x24, x15]\n"
416 "add x24, x24, #16\n"
417 "bge 1b\n"
418 "2:\n"
419 "cmp %w[n_channels], #2\n"
420 "blt 3f\n"
421 "ldr d8, [%[inptr0], x20]\n"
422 "mov v14.16b, v8.16b\n"
423 "ldr d2, [%[inptr0], x18]\n"
424 "mov v10.16b, v8.16b\n"
425 "ldr d9, [%[inptr0]]\n"
426 "fmla v14.4s, v9.4s, v0.s[2]\n"
427 "ldr d1, [%[inptr0], x21]\n"
428 "mov v9.16b, v8.16b\n"
429 "ldr d4, [%[inptr0], x19]\n"
430 "mov v7.16b, v8.16b\n"
431 "ldr d12, [%[inptr0], %[input_col_stride1]]\n"
432 "fmls v14.4s, v2.4s, v0.s[3]\n"
433 "ldr d5, [x16, x20]\n"
434 "fmls v10.4s, v12.4s, v0.s[2]\n"
435 "ldr d20, [x16, x18]\n"
436 "fmla v9.4s, v12.4s, v0.s[2]\n"
437 "ldr d3, [x16]\n"
438 "fmls v7.4s, v12.4s, v0.s[1]\n"
439 "ldr d6, [x16, x21]\n"
440 "fmls v10.4s, v2.4s, v0.s[2]\n"
441 "ldr d16, [x16, x19]\n"
442 "fmls v9.4s, v2.4s, v0.s[2]\n"
443 "ldr d22, [x16, %[input_col_stride1]]\n"
444 "fsub v7.4s, v7.4s, v2.4s\n"
445 "ldr d17, [x17, x20]\n"
446 "fadd v10.4s, v10.4s, v4.4s\n"
447 "ldr d15, [x17, x18]\n"
448 "fsub v9.4s, v9.4s, v4.4s\n"
449 "ldr d19, [x17]\n"
450 "fmla v7.4s, v4.4s, v0.s[1]\n"
451 "ldr d18, [x17, x21]\n"
452 "mov v8.16b, v8.16b\n"
453 "ldr d13, [x17, x19]\n"
454 "mov v11.16b, v1.16b\n"
455 "ldr d21, [x17, %[input_col_stride1]]\n"
456 "fmla v8.4s, v12.4s, v0.s[1]\n"
457 "add %[inptr0], %[inptr0], #8\n"
458 "fmla v11.4s, v12.4s, v0.s[2]\n"
459 "add x16, x16, #8\n"
460 "mov v1.16b, v5.16b\n"
461 "add x17, x17, #8\n"
462 "fsub v8.4s, v8.4s, v2.4s\n"
463 "mov v2.16b, v5.16b\n"
464 "fmls v8.4s, v4.4s, v0.s[1]\n"
465 "fmls v11.4s, v4.4s, v0.s[3]\n"
466 "fmla v1.4s, v3.4s, v0.s[2]\n"
467 "fmls v2.4s, v22.4s, v0.s[2]\n"
468 "mov v3.16b, v5.16b\n"
469 "mov v4.16b, v5.16b\n"
470 "mov v5.16b, v5.16b\n"
471 "mov v6.16b, v6.16b\n"
472 "fmls v1.4s, v20.4s, v0.s[3]\n"
473 "fmls v2.4s, v20.4s, v0.s[2]\n"
474 "fmla v3.4s, v22.4s, v0.s[2]\n"
475 "fmls v4.4s, v22.4s, v0.s[1]\n"
476 "fmla v5.4s, v22.4s, v0.s[1]\n"
477 "fmla v6.4s, v22.4s, v0.s[2]\n"
478 "fadd v2.4s, v2.4s, v16.4s\n"
479 "mov v12.16b, v17.16b\n"
480 "fmls v3.4s, v20.4s, v0.s[2]\n"
481 "fsub v4.4s, v4.4s, v20.4s\n"
482 "fmla v4.4s, v16.4s, v0.s[1]\n"
483 "fsub v5.4s, v5.4s, v20.4s\n"
484 "fmls v5.4s, v16.4s, v0.s[1]\n"
485 "fmls v6.4s, v16.4s, v0.s[3]\n"
486 "fsub v3.4s, v3.4s, v16.4s\n"
487 "fmla v12.4s, v19.4s, v0.s[2]\n"
488 "mov v19.16b, v17.16b\n"
489 "mov v20.16b, v17.16b\n"
490 "mov v16.16b, v17.16b\n"
491 "mov v17.16b, v17.16b\n"
492 "fmls v12.4s, v15.4s, v0.s[3]\n"
493 "fmls v19.4s, v21.4s, v0.s[2]\n"
494 "fmla v20.4s, v21.4s, v0.s[2]\n"
495 "fmls v16.4s, v21.4s, v0.s[1]\n"
496 "fmla v17.4s, v21.4s, v0.s[1]\n"
497 "mov v18.16b, v18.16b\n"
498 "fmls v19.4s, v15.4s, v0.s[2]\n"
499 "mov v23.16b, v12.16b\n"
500 "fmls v20.4s, v15.4s, v0.s[2]\n"
501 "fsub v16.4s, v16.4s, v15.4s\n"
502 "fmla v16.4s, v13.4s, v0.s[1]\n"
503 "fsub v17.4s, v17.4s, v15.4s\n"
504 "fadd v19.4s, v19.4s, v13.4s\n"
505 "fmls v17.4s, v13.4s, v0.s[1]\n"
506 "fsub v20.4s, v20.4s, v13.4s\n"
507 "fmla v18.4s, v21.4s, v0.s[2]\n"
508 "fmla v23.4s, v14.4s, v0.s[2]\n"
509 "mov v15.16b, v19.16b\n"
510 "mov v14.16b, v20.16b\n"
511 "mov v24.16b, v16.16b\n"
512 "fmls v18.4s, v13.4s, v0.s[3]\n"
513 "fmla v15.4s, v10.4s, v0.s[2]\n"
514 "fmls v23.4s, v1.4s, v0.s[3]\n"
515 "fmla v14.4s, v9.4s, v0.s[2]\n"
516 "fmla v24.4s, v7.4s, v0.s[2]\n"
517 "mov v10.16b, v17.16b\n"
518 "fmls v15.4s, v2.4s, v0.s[3]\n"
519 "mov v7.16b, v18.16b\n"
520 "str d23, [%[outptr0]]\n"
521 "fmls v14.4s, v3.4s, v0.s[3]\n"
522 "fmls v24.4s, v4.4s, v0.s[3]\n"
523 "fmla v10.4s, v8.4s, v0.s[2]\n"
524 "str d15, [%[outptr0], %[output_col_stride1]]\n"
525 "fmla v7.4s, v11.4s, v0.s[2]\n"
526 "str d14, [%[outptr0], x11]\n"
527 "fmls v10.4s, v5.4s, v0.s[3]\n"
528 "str d24, [%[outptr0], x13]\n"
529 "fmls v7.4s, v6.4s, v0.s[3]\n"
530 "str d10, [%[outptr0], x23]\n"
531 "str d7, [%[outptr0], x15]\n"
532 "add %[outptr0], %[outptr0], #8\n"
533 "mov v26.16b, v12.16b\n"
534 "mov v25.16b, v19.16b\n"
535 "ldr d11, [x25, x20]\n"
536 "mov v10.16b, v11.16b\n"
537 "ldr d23, [x25, x18]\n"
538 "mov v9.16b, v11.16b\n"
539 "ldr d7, [x25]\n"
540 "fmla v10.4s, v7.4s, v0.s[2]\n"
541 "ldr d13, [x25, x21]\n"
542 "mov v7.16b, v11.16b\n"
543 "ldr d31, [x25, x19]\n"
544 "mov v8.16b, v11.16b\n"
545 "ldr d21, [x25, %[input_col_stride1]]\n"
546 "fmls v10.4s, v23.4s, v0.s[3]\n"
547 "ldr d30, [x26, x20]\n"
548 "fmls v9.4s, v21.4s, v0.s[2]\n"
549 "ldr d29, [x26, x18]\n"
550 "fmla v7.4s, v21.4s, v0.s[2]\n"
551 "ldr d22, [x26]\n"
552 "fmls v8.4s, v21.4s, v0.s[1]\n"
553 "ldr d24, [x26, x21]\n"
554 "fmls v9.4s, v23.4s, v0.s[2]\n"
555 "ldr d27, [x26, x19]\n"
556 "fmls v7.4s, v23.4s, v0.s[2]\n"
557 "ldr d28, [x26, %[input_col_stride1]]\n"
558 "fsub v8.4s, v8.4s, v23.4s\n"
559 "add x25, x25, #8\n"
560 "fadd v9.4s, v9.4s, v31.4s\n"
561 "add x26, x26, #8\n"
562 "fsub v7.4s, v7.4s, v31.4s\n"
563 "fmla v8.4s, v31.4s, v0.s[1]\n"
564 "mov v11.16b, v11.16b\n"
565 "mov v15.16b, v13.16b\n"
566 "mov v14.16b, v30.16b\n"
567 "mov v13.16b, v30.16b\n"
568 "fmla v11.4s, v21.4s, v0.s[1]\n"
569 "fmla v15.4s, v21.4s, v0.s[2]\n"
570 "fmla v14.4s, v22.4s, v0.s[2]\n"
571 "fmls v13.4s, v28.4s, v0.s[2]\n"
572 "mov v21.16b, v30.16b\n"
573 "mov v22.16b, v30.16b\n"
574 "fsub v11.4s, v11.4s, v23.4s\n"
575 "fmls v15.4s, v31.4s, v0.s[3]\n"
576 "fmls v11.4s, v31.4s, v0.s[1]\n"
577 "fmls v14.4s, v29.4s, v0.s[3]\n"
578 "fmls v13.4s, v29.4s, v0.s[2]\n"
579 "fmla v21.4s, v28.4s, v0.s[2]\n"
580 "fmls v22.4s, v28.4s, v0.s[1]\n"
581 "mov v23.16b, v30.16b\n"
582 "mov v24.16b, v24.16b\n"
583 "fmls v26.4s, v10.4s, v0.s[2]\n"
584 "fadd v13.4s, v13.4s, v27.4s\n"
585 "fmls v21.4s, v29.4s, v0.s[2]\n"
586 "fsub v22.4s, v22.4s, v29.4s\n"
587 "fmla v23.4s, v28.4s, v0.s[1]\n"
588 "fmla v22.4s, v27.4s, v0.s[1]\n"
589 "fmla v24.4s, v28.4s, v0.s[2]\n"
590 "fsub v21.4s, v21.4s, v27.4s\n"
591 "fmls v26.4s, v1.4s, v0.s[2]\n"
592 "fsub v23.4s, v23.4s, v29.4s\n"
593 "fmls v25.4s, v9.4s, v0.s[2]\n"
594 "fmls v23.4s, v27.4s, v0.s[1]\n"
595 "fmls v24.4s, v27.4s, v0.s[3]\n"
596 "fadd v26.4s, v26.4s, v14.4s\n"
597 "mov v27.16b, v20.16b\n"
598 "str d26, [x28]\n"
599 "fmls v25.4s, v2.4s, v0.s[2]\n"
600 "fmls v27.4s, v7.4s, v0.s[2]\n"
601 "mov v31.16b, v16.16b\n"
602 "mov v30.16b, v17.16b\n"
603 "mov v29.16b, v18.16b\n"
604 "fadd v25.4s, v25.4s, v13.4s\n"
605 "fmls v31.4s, v8.4s, v0.s[2]\n"
606 "str d25, [x28, %[output_col_stride1]]\n"
607 "fmls v27.4s, v3.4s, v0.s[2]\n"
608 "fmls v30.4s, v11.4s, v0.s[2]\n"
609 "fmls v29.4s, v15.4s, v0.s[2]\n"
610 "fmls v31.4s, v4.4s, v0.s[2]\n"
611 "mov v26.16b, v12.16b\n"
612 "fadd v27.4s, v27.4s, v21.4s\n"
613 "mov v25.16b, v19.16b\n"
614 "str d27, [x28, x11]\n"
615 "fmls v30.4s, v5.4s, v0.s[2]\n"
616 "fadd v31.4s, v31.4s, v22.4s\n"
617 "fmls v29.4s, v6.4s, v0.s[2]\n"
618 "str d31, [x28, x13]\n"
619 "fmla v26.4s, v10.4s, v0.s[2]\n"
620 "fadd v30.4s, v30.4s, v23.4s\n"
621 "fmla v25.4s, v9.4s, v0.s[2]\n"
622 "str d30, [x28, x23]\n"
623 "fadd v29.4s, v29.4s, v24.4s\n"
624 "str d29, [x28, x15]\n"
625 "fmls v26.4s, v1.4s, v0.s[2]\n"
626 "fmls v25.4s, v2.4s, v0.s[2]\n"
627 "add x28, x28, #8\n"
628 "mov v30.16b, v20.16b\n"
629 "mov v29.16b, v16.16b\n"
630 "fsub v26.4s, v26.4s, v14.4s\n"
631 "mov v28.16b, v17.16b\n"
632 "str d26, [x22]\n"
633 "fsub v25.4s, v25.4s, v13.4s\n"
634 "str d25, [x22, %[output_col_stride1]]\n"
635 "fmla v30.4s, v7.4s, v0.s[2]\n"
636 "fmla v29.4s, v8.4s, v0.s[2]\n"
637 "fmla v28.4s, v11.4s, v0.s[2]\n"
638 "mov v26.16b, v18.16b\n"
639 "mov v25.16b, v12.16b\n"
640 "fmls v30.4s, v3.4s, v0.s[2]\n"
641 "mov v31.16b, v19.16b\n"
642 "fmls v29.4s, v4.4s, v0.s[2]\n"
643 "fmls v28.4s, v5.4s, v0.s[2]\n"
644 "fmla v26.4s, v15.4s, v0.s[2]\n"
645 "fmls v25.4s, v10.4s, v0.s[1]\n"
646 "fsub v30.4s, v30.4s, v21.4s\n"
647 "fmls v31.4s, v9.4s, v0.s[1]\n"
648 "str d30, [x22, x11]\n"
649 "fsub v29.4s, v29.4s, v22.4s\n"
650 "str d29, [x22, x13]\n"
651 "fsub v28.4s, v28.4s, v23.4s\n"
652 "str d28, [x22, x23]\n"
653 "fmls v26.4s, v6.4s, v0.s[2]\n"
654 "fsub v25.4s, v25.4s, v1.4s\n"
655 "fsub v31.4s, v31.4s, v2.4s\n"
656 "fmla v25.4s, v14.4s, v0.s[1]\n"
657 "fmla v31.4s, v13.4s, v0.s[1]\n"
658 "fsub v26.4s, v26.4s, v24.4s\n"
659 "mov v27.16b, v20.16b\n"
660 "str d26, [x22, x15]\n"
661 "mov v26.16b, v16.16b\n"
662 "str d25, [x12]\n"
663 "fmls v27.4s, v7.4s, v0.s[1]\n"
664 "str d31, [x12, %[output_col_stride1]]\n"
665 "fmls v26.4s, v8.4s, v0.s[1]\n"
666 "mov v25.16b, v17.16b\n"
667 "add x22, x22, #8\n"
668 "fsub v27.4s, v27.4s, v3.4s\n"
669 "mov v28.16b, v18.16b\n"
670 "fmla v27.4s, v21.4s, v0.s[1]\n"
671 "fsub v26.4s, v26.4s, v4.4s\n"
672 "fmla v26.4s, v22.4s, v0.s[1]\n"
673 "fmls v25.4s, v11.4s, v0.s[1]\n"
674 "fmls v28.4s, v15.4s, v0.s[1]\n"
675 "mov v12.16b, v12.16b\n"
676 "str d27, [x12, x11]\n"
677 "mov v19.16b, v19.16b\n"
678 "str d26, [x12, x13]\n"
679 "fsub v25.4s, v25.4s, v5.4s\n"
680 "fmla v25.4s, v23.4s, v0.s[1]\n"
681 "fsub v28.4s, v28.4s, v6.4s\n"
682 "fmla v28.4s, v24.4s, v0.s[1]\n"
683 "fmla v12.4s, v10.4s, v0.s[1]\n"
684 "fmla v19.4s, v9.4s, v0.s[1]\n"
685 "mov v20.16b, v20.16b\n"
686 "str d25, [x12, x23]\n"
687 "mov v16.16b, v16.16b\n"
688 "str d28, [x12, x15]\n"
689 "fsub v12.4s, v12.4s, v1.4s\n"
690 "fmls v12.4s, v14.4s, v0.s[1]\n"
691 "add x12, x12, #8\n"
692 "fsub v19.4s, v19.4s, v2.4s\n"
693 "fmla v20.4s, v7.4s, v0.s[1]\n"
694 "fmls v19.4s, v13.4s, v0.s[1]\n"
695 "fmla v16.4s, v8.4s, v0.s[1]\n"
696 "str d12, [x14]\n"
697 "mov v1.16b, v17.16b\n"
698 "fsub v20.4s, v20.4s, v3.4s\n"
699 "mov v17.16b, v18.16b\n"
700 "str d19, [x14, %[output_col_stride1]]\n"
701 "fmls v20.4s, v21.4s, v0.s[1]\n"
702 "fsub v16.4s, v16.4s, v4.4s\n"
703 "fmla v1.4s, v11.4s, v0.s[1]\n"
704 "fmls v16.4s, v22.4s, v0.s[1]\n"
705 "fmla v17.4s, v15.4s, v0.s[1]\n"
706 "str d20, [x14, x11]\n"
707 "fsub v1.4s, v1.4s, v5.4s\n"
708 "str d16, [x14, x13]\n"
709 "fmls v1.4s, v23.4s, v0.s[1]\n"
710 "fsub v17.4s, v17.4s, v6.4s\n"
711 "fmls v17.4s, v24.4s, v0.s[1]\n"
712 "str d1, [x14, x23]\n"
713 "str d17, [x14, x15]\n"
714 "add x14, x14, #8\n"
715 "ldr d2, [x27, x20]\n"
716 "mov v4.16b, v2.16b\n"
717 "ldr d17, [x27, x18]\n"
718 "mov v12.16b, v2.16b\n"
719 "ldr d18, [x27]\n"
720 "fmla v4.4s, v18.4s, v0.s[2]\n"
721 "ldr d3, [x27, x21]\n"
722 "mov v6.16b, v2.16b\n"
723 "ldr d5, [x27, x19]\n"
724 "mov v1.16b, v2.16b\n"
725 "ldr d18, [x27, %[input_col_stride1]]\n"
726 "fmls v4.4s, v17.4s, v0.s[3]\n"
727 "add x27, x27, #8\n"
728 "fmls v12.4s, v18.4s, v0.s[2]\n"
729 "sub %w[n_channels], %w[n_channels], #2\n"
730 "fmla v6.4s, v18.4s, v0.s[2]\n"
731 "fmls v1.4s, v18.4s, v0.s[1]\n"
732 "mov v2.16b, v2.16b\n"
733 "mov v3.16b, v3.16b\n"
734 "fmls v12.4s, v17.4s, v0.s[2]\n"
735 "mov v4.16b, v4.16b\n"
736 "fmls v6.4s, v17.4s, v0.s[2]\n"
737 "fsub v1.4s, v1.4s, v17.4s\n"
738 "fmla v1.4s, v5.4s, v0.s[1]\n"
739 "fmla v2.4s, v18.4s, v0.s[1]\n"
740 "fadd v12.4s, v12.4s, v5.4s\n"
741 "fmla v3.4s, v18.4s, v0.s[2]\n"
742 "fsub v6.4s, v6.4s, v5.4s\n"
743 "fmla v4.4s, v10.4s, v0.s[2]\n"
744 "fsub v2.4s, v2.4s, v17.4s\n"
745 "mov v16.16b, v12.16b\n"
746 "fmls v2.4s, v5.4s, v0.s[1]\n"
747 "fmls v3.4s, v5.4s, v0.s[3]\n"
748 "fmls v4.4s, v14.4s, v0.s[3]\n"
749 "fmla v16.4s, v9.4s, v0.s[2]\n"
750 "mov v5.16b, v6.16b\n"
751 "mov v6.16b, v1.16b\n"
752 "mov v9.16b, v2.16b\n"
753 "mov v10.16b, v3.16b\n"
754 "str d4, [x24]\n"
755 "fmls v16.4s, v13.4s, v0.s[3]\n"
756 "fmla v5.4s, v7.4s, v0.s[2]\n"
757 "fmla v6.4s, v8.4s, v0.s[2]\n"
758 "fmla v9.4s, v11.4s, v0.s[2]\n"
759 "fmla v10.4s, v15.4s, v0.s[2]\n"
760 "str d16, [x24, %[output_col_stride1]]\n"
761 "fmls v5.4s, v21.4s, v0.s[3]\n"
762 "fmls v6.4s, v22.4s, v0.s[3]\n"
763 "fmls v9.4s, v23.4s, v0.s[3]\n"
764 "fmls v10.4s, v24.4s, v0.s[3]\n"
765 "str d5, [x24, x11]\n"
766 "str d6, [x24, x13]\n"
767 "str d9, [x24, x23]\n"
768 "str d10, [x24, x15]\n"
769 "add x24, x24, #8\n"
770 "3:\n"
771 "cbz %w[n_channels], 4f\n"
772 "ldr s8, [%[inptr0], x20]\n"
773 "mov v14.16b, v8.16b\n"
774 "ldr s2, [%[inptr0], x18]\n"
775 "mov v10.16b, v8.16b\n"
776 "ldr s9, [%[inptr0]]\n"
777 "fmla v14.4s, v9.4s, v0.s[2]\n"
778 "ldr s1, [%[inptr0], x21]\n"
779 "mov v9.16b, v8.16b\n"
780 "ldr s4, [%[inptr0], x19]\n"
781 "mov v7.16b, v8.16b\n"
782 "ldr s12, [%[inptr0], %[input_col_stride1]]\n"
783 "fmls v14.4s, v2.4s, v0.s[3]\n"
784 "ldr s5, [x16, x20]\n"
785 "fmls v10.4s, v12.4s, v0.s[2]\n"
786 "ldr s20, [x16, x18]\n"
787 "fmla v9.4s, v12.4s, v0.s[2]\n"
788 "ldr s3, [x16]\n"
789 "fmls v7.4s, v12.4s, v0.s[1]\n"
790 "ldr s6, [x16, x21]\n"
791 "fmls v10.4s, v2.4s, v0.s[2]\n"
792 "ldr s16, [x16, x19]\n"
793 "fmls v9.4s, v2.4s, v0.s[2]\n"
794 "ldr s22, [x16, %[input_col_stride1]]\n"
795 "fsub v7.4s, v7.4s, v2.4s\n"
796 "ldr s17, [x17, x20]\n"
797 "fadd v10.4s, v10.4s, v4.4s\n"
798 "ldr s15, [x17, x18]\n"
799 "fsub v9.4s, v9.4s, v4.4s\n"
800 "ldr s19, [x17]\n"
801 "fmla v7.4s, v4.4s, v0.s[1]\n"
802 "ldr s18, [x17, x21]\n"
803 "mov v8.16b, v8.16b\n"
804 "ldr s13, [x17, x19]\n"
805 "mov v11.16b, v1.16b\n"
806 "ldr s21, [x17, %[input_col_stride1]]\n"
807 "fmla v8.4s, v12.4s, v0.s[1]\n"
808 "add %[inptr0], %[inptr0], #4\n"
809 "fmla v11.4s, v12.4s, v0.s[2]\n"
810 "add x16, x16, #4\n"
811 "mov v1.16b, v5.16b\n"
812 "add x17, x17, #4\n"
813 "fsub v8.4s, v8.4s, v2.4s\n"
814 "mov v2.16b, v5.16b\n"
815 "fmls v8.4s, v4.4s, v0.s[1]\n"
816 "fmls v11.4s, v4.4s, v0.s[3]\n"
817 "fmla v1.4s, v3.4s, v0.s[2]\n"
818 "fmls v2.4s, v22.4s, v0.s[2]\n"
819 "mov v3.16b, v5.16b\n"
820 "mov v4.16b, v5.16b\n"
821 "mov v5.16b, v5.16b\n"
822 "mov v6.16b, v6.16b\n"
823 "fmls v1.4s, v20.4s, v0.s[3]\n"
824 "fmls v2.4s, v20.4s, v0.s[2]\n"
825 "fmla v3.4s, v22.4s, v0.s[2]\n"
826 "fmls v4.4s, v22.4s, v0.s[1]\n"
827 "fmla v5.4s, v22.4s, v0.s[1]\n"
828 "fmla v6.4s, v22.4s, v0.s[2]\n"
829 "fadd v2.4s, v2.4s, v16.4s\n"
830 "mov v12.16b, v17.16b\n"
831 "fmls v3.4s, v20.4s, v0.s[2]\n"
832 "fsub v4.4s, v4.4s, v20.4s\n"
833 "fmla v4.4s, v16.4s, v0.s[1]\n"
834 "fsub v5.4s, v5.4s, v20.4s\n"
835 "fmls v5.4s, v16.4s, v0.s[1]\n"
836 "fmls v6.4s, v16.4s, v0.s[3]\n"
837 "fsub v3.4s, v3.4s, v16.4s\n"
838 "fmla v12.4s, v19.4s, v0.s[2]\n"
839 "mov v19.16b, v17.16b\n"
840 "mov v20.16b, v17.16b\n"
841 "mov v16.16b, v17.16b\n"
842 "mov v17.16b, v17.16b\n"
843 "fmls v12.4s, v15.4s, v0.s[3]\n"
844 "fmls v19.4s, v21.4s, v0.s[2]\n"
845 "fmla v20.4s, v21.4s, v0.s[2]\n"
846 "fmls v16.4s, v21.4s, v0.s[1]\n"
847 "fmla v17.4s, v21.4s, v0.s[1]\n"
848 "mov v18.16b, v18.16b\n"
849 "fmls v19.4s, v15.4s, v0.s[2]\n"
850 "mov v23.16b, v12.16b\n"
851 "fmls v20.4s, v15.4s, v0.s[2]\n"
852 "fsub v16.4s, v16.4s, v15.4s\n"
853 "fmla v16.4s, v13.4s, v0.s[1]\n"
854 "fsub v17.4s, v17.4s, v15.4s\n"
855 "fadd v19.4s, v19.4s, v13.4s\n"
856 "fmls v17.4s, v13.4s, v0.s[1]\n"
857 "fsub v20.4s, v20.4s, v13.4s\n"
858 "fmla v18.4s, v21.4s, v0.s[2]\n"
859 "fmla v23.4s, v14.4s, v0.s[2]\n"
860 "mov v15.16b, v19.16b\n"
861 "mov v14.16b, v20.16b\n"
862 "mov v24.16b, v16.16b\n"
863 "fmls v18.4s, v13.4s, v0.s[3]\n"
864 "fmla v15.4s, v10.4s, v0.s[2]\n"
865 "fmls v23.4s, v1.4s, v0.s[3]\n"
866 "fmla v14.4s, v9.4s, v0.s[2]\n"
867 "fmla v24.4s, v7.4s, v0.s[2]\n"
868 "mov v10.16b, v17.16b\n"
869 "fmls v15.4s, v2.4s, v0.s[3]\n"
870 "mov v7.16b, v18.16b\n"
871 "str s23, [%[outptr0]]\n"
872 "fmls v14.4s, v3.4s, v0.s[3]\n"
873 "fmls v24.4s, v4.4s, v0.s[3]\n"
874 "fmla v10.4s, v8.4s, v0.s[2]\n"
875 "str s15, [%[outptr0], %[output_col_stride1]]\n"
876 "fmla v7.4s, v11.4s, v0.s[2]\n"
877 "str s14, [%[outptr0], x11]\n"
878 "fmls v10.4s, v5.4s, v0.s[3]\n"
879 "str s24, [%[outptr0], x13]\n"
880 "fmls v7.4s, v6.4s, v0.s[3]\n"
881 "str s10, [%[outptr0], x23]\n"
882 "str s7, [%[outptr0], x15]\n"
883 "add %[outptr0], %[outptr0], #4\n"
884 "mov v26.16b, v12.16b\n"
885 "mov v25.16b, v19.16b\n"
886 "ldr s11, [x25, x20]\n"
887 "mov v10.16b, v11.16b\n"
888 "ldr s23, [x25, x18]\n"
889 "mov v9.16b, v11.16b\n"
890 "ldr s7, [x25]\n"
891 "fmla v10.4s, v7.4s, v0.s[2]\n"
892 "ldr s13, [x25, x21]\n"
893 "mov v7.16b, v11.16b\n"
894 "ldr s31, [x25, x19]\n"
895 "mov v8.16b, v11.16b\n"
896 "ldr s21, [x25, %[input_col_stride1]]\n"
897 "fmls v10.4s, v23.4s, v0.s[3]\n"
898 "ldr s30, [x26, x20]\n"
899 "fmls v9.4s, v21.4s, v0.s[2]\n"
900 "ldr s29, [x26, x18]\n"
901 "fmla v7.4s, v21.4s, v0.s[2]\n"
902 "ldr s22, [x26]\n"
903 "fmls v8.4s, v21.4s, v0.s[1]\n"
904 "ldr s24, [x26, x21]\n"
905 "fmls v9.4s, v23.4s, v0.s[2]\n"
906 "ldr s27, [x26, x19]\n"
907 "fmls v7.4s, v23.4s, v0.s[2]\n"
908 "ldr s28, [x26, %[input_col_stride1]]\n"
909 "fsub v8.4s, v8.4s, v23.4s\n"
910 "add x25, x25, #4\n"
911 "fadd v9.4s, v9.4s, v31.4s\n"
912 "add x26, x26, #4\n"
913 "fsub v7.4s, v7.4s, v31.4s\n"
914 "fmla v8.4s, v31.4s, v0.s[1]\n"
915 "mov v11.16b, v11.16b\n"
916 "mov v15.16b, v13.16b\n"
917 "mov v14.16b, v30.16b\n"
918 "mov v13.16b, v30.16b\n"
919 "fmla v11.4s, v21.4s, v0.s[1]\n"
920 "fmla v15.4s, v21.4s, v0.s[2]\n"
921 "fmla v14.4s, v22.4s, v0.s[2]\n"
922 "fmls v13.4s, v28.4s, v0.s[2]\n"
923 "mov v21.16b, v30.16b\n"
924 "mov v22.16b, v30.16b\n"
925 "fsub v11.4s, v11.4s, v23.4s\n"
926 "fmls v15.4s, v31.4s, v0.s[3]\n"
927 "fmls v11.4s, v31.4s, v0.s[1]\n"
928 "fmls v14.4s, v29.4s, v0.s[3]\n"
929 "fmls v13.4s, v29.4s, v0.s[2]\n"
930 "fmla v21.4s, v28.4s, v0.s[2]\n"
931 "fmls v22.4s, v28.4s, v0.s[1]\n"
932 "mov v23.16b, v30.16b\n"
933 "mov v24.16b, v24.16b\n"
934 "fmls v26.4s, v10.4s, v0.s[2]\n"
935 "fadd v13.4s, v13.4s, v27.4s\n"
936 "fmls v21.4s, v29.4s, v0.s[2]\n"
937 "fsub v22.4s, v22.4s, v29.4s\n"
938 "fmla v23.4s, v28.4s, v0.s[1]\n"
939 "fmla v22.4s, v27.4s, v0.s[1]\n"
940 "fmla v24.4s, v28.4s, v0.s[2]\n"
941 "fsub v21.4s, v21.4s, v27.4s\n"
942 "fmls v26.4s, v1.4s, v0.s[2]\n"
943 "fsub v23.4s, v23.4s, v29.4s\n"
944 "fmls v25.4s, v9.4s, v0.s[2]\n"
945 "fmls v23.4s, v27.4s, v0.s[1]\n"
946 "fmls v24.4s, v27.4s, v0.s[3]\n"
947 "fadd v26.4s, v26.4s, v14.4s\n"
948 "mov v27.16b, v20.16b\n"
949 "str s26, [x28]\n"
950 "fmls v25.4s, v2.4s, v0.s[2]\n"
951 "fmls v27.4s, v7.4s, v0.s[2]\n"
952 "mov v31.16b, v16.16b\n"
953 "mov v30.16b, v17.16b\n"
954 "mov v29.16b, v18.16b\n"
955 "fadd v25.4s, v25.4s, v13.4s\n"
956 "fmls v31.4s, v8.4s, v0.s[2]\n"
957 "str s25, [x28, %[output_col_stride1]]\n"
958 "fmls v27.4s, v3.4s, v0.s[2]\n"
959 "fmls v30.4s, v11.4s, v0.s[2]\n"
960 "fmls v29.4s, v15.4s, v0.s[2]\n"
961 "fmls v31.4s, v4.4s, v0.s[2]\n"
962 "mov v26.16b, v12.16b\n"
963 "fadd v27.4s, v27.4s, v21.4s\n"
964 "mov v25.16b, v19.16b\n"
965 "str s27, [x28, x11]\n"
966 "fmls v30.4s, v5.4s, v0.s[2]\n"
967 "fadd v31.4s, v31.4s, v22.4s\n"
968 "fmls v29.4s, v6.4s, v0.s[2]\n"
969 "str s31, [x28, x13]\n"
970 "fmla v26.4s, v10.4s, v0.s[2]\n"
971 "fadd v30.4s, v30.4s, v23.4s\n"
972 "fmla v25.4s, v9.4s, v0.s[2]\n"
973 "str s30, [x28, x23]\n"
974 "fadd v29.4s, v29.4s, v24.4s\n"
975 "str s29, [x28, x15]\n"
976 "fmls v26.4s, v1.4s, v0.s[2]\n"
977 "fmls v25.4s, v2.4s, v0.s[2]\n"
978 "add x28, x28, #4\n"
979 "mov v30.16b, v20.16b\n"
980 "mov v29.16b, v16.16b\n"
981 "fsub v26.4s, v26.4s, v14.4s\n"
982 "mov v28.16b, v17.16b\n"
983 "str s26, [x22]\n"
984 "fsub v25.4s, v25.4s, v13.4s\n"
985 "str s25, [x22, %[output_col_stride1]]\n"
986 "fmla v30.4s, v7.4s, v0.s[2]\n"
987 "fmla v29.4s, v8.4s, v0.s[2]\n"
988 "fmla v28.4s, v11.4s, v0.s[2]\n"
989 "mov v26.16b, v18.16b\n"
990 "mov v25.16b, v12.16b\n"
991 "fmls v30.4s, v3.4s, v0.s[2]\n"
992 "mov v31.16b, v19.16b\n"
993 "fmls v29.4s, v4.4s, v0.s[2]\n"
994 "fmls v28.4s, v5.4s, v0.s[2]\n"
995 "fmla v26.4s, v15.4s, v0.s[2]\n"
996 "fmls v25.4s, v10.4s, v0.s[1]\n"
997 "fsub v30.4s, v30.4s, v21.4s\n"
998 "fmls v31.4s, v9.4s, v0.s[1]\n"
999 "str s30, [x22, x11]\n"
1000 "fsub v29.4s, v29.4s, v22.4s\n"
1001 "str s29, [x22, x13]\n"
1002 "fsub v28.4s, v28.4s, v23.4s\n"
1003 "str s28, [x22, x23]\n"
1004 "fmls v26.4s, v6.4s, v0.s[2]\n"
1005 "fsub v25.4s, v25.4s, v1.4s\n"
1006 "fsub v31.4s, v31.4s, v2.4s\n"
1007 "fmla v25.4s, v14.4s, v0.s[1]\n"
1008 "fmla v31.4s, v13.4s, v0.s[1]\n"
1009 "fsub v26.4s, v26.4s, v24.4s\n"
1010 "mov v27.16b, v20.16b\n"
1011 "str s26, [x22, x15]\n"
1012 "mov v26.16b, v16.16b\n"
1013 "str s25, [x12]\n"
1014 "fmls v27.4s, v7.4s, v0.s[1]\n"
1015 "str s31, [x12, %[output_col_stride1]]\n"
1016 "fmls v26.4s, v8.4s, v0.s[1]\n"
1017 "mov v25.16b, v17.16b\n"
1018 "add x22, x22, #4\n"
1019 "fsub v27.4s, v27.4s, v3.4s\n"
1020 "mov v28.16b, v18.16b\n"
1021 "fmla v27.4s, v21.4s, v0.s[1]\n"
1022 "fsub v26.4s, v26.4s, v4.4s\n"
1023 "fmla v26.4s, v22.4s, v0.s[1]\n"
1024 "fmls v25.4s, v11.4s, v0.s[1]\n"
1025 "fmls v28.4s, v15.4s, v0.s[1]\n"
1026 "mov v12.16b, v12.16b\n"
1027 "str s27, [x12, x11]\n"
1028 "mov v19.16b, v19.16b\n"
1029 "str s26, [x12, x13]\n"
1030 "fsub v25.4s, v25.4s, v5.4s\n"
1031 "fmla v25.4s, v23.4s, v0.s[1]\n"
1032 "fsub v28.4s, v28.4s, v6.4s\n"
1033 "fmla v28.4s, v24.4s, v0.s[1]\n"
1034 "fmla v12.4s, v10.4s, v0.s[1]\n"
1035 "fmla v19.4s, v9.4s, v0.s[1]\n"
1036 "mov v20.16b, v20.16b\n"
1037 "str s25, [x12, x23]\n"
1038 "mov v16.16b, v16.16b\n"
1039 "str s28, [x12, x15]\n"
1040 "fsub v12.4s, v12.4s, v1.4s\n"
1041 "fmls v12.4s, v14.4s, v0.s[1]\n"
1042 "add x12, x12, #4\n"
1043 "fsub v19.4s, v19.4s, v2.4s\n"
1044 "fmla v20.4s, v7.4s, v0.s[1]\n"
1045 "fmls v19.4s, v13.4s, v0.s[1]\n"
1046 "fmla v16.4s, v8.4s, v0.s[1]\n"
1047 "str s12, [x14]\n"
1048 "mov v1.16b, v17.16b\n"
1049 "fsub v20.4s, v20.4s, v3.4s\n"
1050 "mov v17.16b, v18.16b\n"
1051 "str s19, [x14, %[output_col_stride1]]\n"
1052 "fmls v20.4s, v21.4s, v0.s[1]\n"
1053 "fsub v16.4s, v16.4s, v4.4s\n"
1054 "fmla v1.4s, v11.4s, v0.s[1]\n"
1055 "fmls v16.4s, v22.4s, v0.s[1]\n"
1056 "fmla v17.4s, v15.4s, v0.s[1]\n"
1057 "str s20, [x14, x11]\n"
1058 "fsub v1.4s, v1.4s, v5.4s\n"
1059 "str s16, [x14, x13]\n"
1060 "fmls v1.4s, v23.4s, v0.s[1]\n"
1061 "fsub v17.4s, v17.4s, v6.4s\n"
1062 "fmls v17.4s, v24.4s, v0.s[1]\n"
1063 "str s1, [x14, x23]\n"
1064 "str s17, [x14, x15]\n"
1065 "add x14, x14, #4\n"
1066 "ldr s2, [x27, x20]\n"
1067 "mov v4.16b, v2.16b\n"
1068 "ldr s17, [x27, x18]\n"
1069 "mov v12.16b, v2.16b\n"
1070 "ldr s18, [x27]\n"
1071 "fmla v4.4s, v18.4s, v0.s[2]\n"
1072 "ldr s3, [x27, x21]\n"
1073 "mov v6.16b, v2.16b\n"
1074 "ldr s5, [x27, x19]\n"
1075 "mov v1.16b, v2.16b\n"
1076 "ldr s18, [x27, %[input_col_stride1]]\n"
1077 "fmls v4.4s, v17.4s, v0.s[3]\n"
1078 "add x27, x27, #4\n"
1079 "fmls v12.4s, v18.4s, v0.s[2]\n"
1080 "fmla v6.4s, v18.4s, v0.s[2]\n"
1081 "fmls v1.4s, v18.4s, v0.s[1]\n"
1082 "mov v2.16b, v2.16b\n"
1083 "mov v3.16b, v3.16b\n"
1084 "mov v4.16b, v4.16b\n"
1085 "fmls v12.4s, v17.4s, v0.s[2]\n"
1086 "fmls v6.4s, v17.4s, v0.s[2]\n"
1087 "fsub v1.4s, v1.4s, v17.4s\n"
1088 "fmla v2.4s, v18.4s, v0.s[1]\n"
1089 "fmla v1.4s, v5.4s, v0.s[1]\n"
1090 "fmla v3.4s, v18.4s, v0.s[2]\n"
1091 "fadd v12.4s, v12.4s, v5.4s\n"
1092 "fsub v6.4s, v6.4s, v5.4s\n"
1093 "fsub v2.4s, v2.4s, v17.4s\n"
1094 "fmla v4.4s, v10.4s, v0.s[2]\n"
1095 "fmls v2.4s, v5.4s, v0.s[1]\n"
1096 "fmls v3.4s, v5.4s, v0.s[3]\n"
1097 "mov v16.16b, v12.16b\n"
1098 "mov v5.16b, v6.16b\n"
1099 "fmls v4.4s, v14.4s, v0.s[3]\n"
1100 "mov v6.16b, v1.16b\n"
1101 "fmla v16.4s, v9.4s, v0.s[2]\n"
1102 "fmla v5.4s, v7.4s, v0.s[2]\n"
1103 "fmla v6.4s, v8.4s, v0.s[2]\n"
1104 "mov v9.16b, v2.16b\n"
1105 "str s4, [x24]\n"
1106 "mov v10.16b, v3.16b\n"
1107 "fmls v16.4s, v13.4s, v0.s[3]\n"
1108 "fmls v5.4s, v21.4s, v0.s[3]\n"
1109 "fmls v6.4s, v22.4s, v0.s[3]\n"
1110 "fmla v9.4s, v11.4s, v0.s[2]\n"
1111 "fmla v10.4s, v15.4s, v0.s[2]\n"
1112 "str s16, [x24, %[output_col_stride1]]\n"
1113 "str s5, [x24, x11]\n"
1114 "fmls v9.4s, v23.4s, v0.s[3]\n"
1115 "str s6, [x24, x13]\n"
1116 "fmls v10.4s, v24.4s, v0.s[3]\n"
1117 "str s9, [x24, x23]\n"
1118 "str s10, [x24, x15]\n"
1119 "add x24, x24, #4\n"
1120 "4:\n"
1121 : [outptr0] "+r" (matrix_base),
1122 [n_channels] "+r" (n_channels),
1123 [inptr0] "+r" (input_base)
1124 : [pcoeffs] "r" (pcoeffs),
1125 [output_row_stride] "r" (6 * matrix_stride * sizeof(float)),
1126 [output_col_stride1] "r" (matrix_stride * sizeof(float)),
1127 [input_row_stride] "r" (input_row_stride * sizeof(float)),
1128 [input_col_stride1] "r" (input_col_stride * sizeof(float))
1129 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
1130 "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
1131 "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8",
1132 "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
1133 "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
1134 );
1135}
1136
1137#else // __arm__ not __aarch64__
1138
1139template <>
1140void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
1141 const int n_channels,
1142 const float* const input_base,
1143 const int input_row_stride,
1144 const int input_col_stride,
1145 float* outptr,
1146 const int matrix_stride
1147)
1148{
1149 constexpr int inner_tile_rows = 6;
1150 constexpr int inner_tile_cols = 6;
1151
1152 // Get pointers into the input tile
1153 const float *x_ptrs[inner_tile_rows][inner_tile_cols];
1154 for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
1155 {
1156 // Get a pointer into the row
1157 const float* const row_ptr = input_base + xi*input_row_stride;
1158
1159 for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
1160 {
1161 x_ptrs[i][j] = row_ptr + xj*input_col_stride;
1162 }
1163 }
1164
1165 // Matrices used/computed in this kernel.
1166 float x[inner_tile_rows][inner_tile_cols];
1167 float XTx[inner_tile_rows][inner_tile_cols];
1168 float U[inner_tile_rows][inner_tile_cols];
1169 for (int i = 0; i < inner_tile_rows; i++)
1170 {
1171 for (int j = 0; j < inner_tile_cols; j++)
1172 {
1173 x[i][j] = XTx[i][j] = 0.0f;
1174 }
1175 }
1176
1177 // Perform the Winograd input transformation for each channel in the input
1178 // tensor.
1179 int channels_remaining = n_channels;
1180 for (; channels_remaining >= 2; channels_remaining -= 2)
1181 {
1182 // Matrices used/computed in this kernel
1183 float32x2_t x[inner_tile_rows][inner_tile_cols];
1184 float32x2_t XTx[inner_tile_rows][inner_tile_cols];
1185 float32x2_t U[inner_tile_rows][inner_tile_cols];
1186 for (int i = 0; i < inner_tile_rows; i++)
1187 {
1188 for (int j = 0; j < inner_tile_cols; j++)
1189 {
1190 x[i][j] = vdup_n_f32(0.0f);
1191 XTx[i][j] = vdup_n_f32(0.0f);
1192 }
1193 }
1194
1195 // Read a 6x6 tile in the Winograd domain
1196 for (int i = 0; i < inner_tile_rows; i++)
1197 {
1198 for (int j = 0; j < inner_tile_cols; j++)
1199 {
1200 x[i][j] = vld1_f32(x_ptrs[i][j]);
1201 x_ptrs[i][j] += 2;
1202 }
1203 }
1204
1205 // Compute XT . x
1206 for (int j = 0; j < inner_tile_cols; j++)
1207 {
1208 // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
1209 XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
1210
1211 // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
1212 XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
1213
1214 // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
1215 XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
1216
1217 // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
1218 XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
1219
1220 // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
1221 XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
1222
1223 // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
1224 XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
1225 }
1226
1227 // Compute U = XT . x . X
1228 for (int i = 0; i < inner_tile_rows; i++)
1229 {
1230 // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
1231 U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
1232
1233 // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
1234 U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
1235
1236 // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
1237 U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
1238
1239 // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
1240 U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
1241
1242 // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
1243 U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
1244
1245 // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
1246 U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
1247 }
1248
1249 // Store the transformed matrix
1250 for (int i = 0, m = 0; i < inner_tile_rows; i++)
1251 {
1252 for (int j = 0; j < inner_tile_cols; j++, m++)
1253 {
1254 vst1_f32(outptr + m*matrix_stride, U[i][j]);
1255 }
1256 }
1257 outptr += 2;
1258 }
1259 for (; channels_remaining; channels_remaining--)
1260 {
1261 // Load x
1262 for (int i = 0; i < inner_tile_rows; i++)
1263 {
1264 for (int j = 0; j < inner_tile_cols; j++)
1265 {
1266 x[i][j] = *(x_ptrs[i][j]++);
1267 }
1268 }
1269
1270 // Compute XT . x
1271 for (int j = 0; j < inner_tile_cols; j++)
1272 {
1273 XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
1274 XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
1275 XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
1276 XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
1277 XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
1278 XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
1279 }
1280
1281 // Compute U = XT . x . X
1282 for (int i = 0; i < inner_tile_rows; i++)
1283 {
1284 U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
1285 U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
1286 U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
1287 U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
1288 U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
1289 U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
1290 }
1291
1292 // Store the transformed matrix
1293 for (int i = 0, m = 0; i < inner_tile_rows; i++)
1294 {
1295 for (int j = 0; j < inner_tile_cols; j++, m++)
1296 {
1297 *(outptr + m*matrix_stride) = U[i][j];
1298 }
1299 }
1300 outptr++;
1301 }
1302}
1303
1304#endif
1305
// Explicit instantiation of the InputTransform class template for the
// <6, 6, float, float, WinogradRoots::Integers> parameters, i.e. the
// configuration whose transform_tile member is specialised above.
template class InputTransform<6, 6, float, float, WinogradRoots::Integers>;
1307
1308} // namespace winograd