/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25#include "arm.hpp"
26#include "input.hpp"
27
28namespace winograd
29{
30
31#ifdef __aarch64__
32
33template <>
34void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
35 int n_channels,
36 const float* input_base,
37 const int input_row_stride,
38 const int input_col_stride,
39 float* matrix_base,
40 const int matrix_stride
41)
42{
43 const float pcoeffs[4] = {1.0f, 2.0f, 4.0f, 5.0f};
44 __asm__ __volatile__(
45 "ldr q0, [%[pcoeffs]]\n"
46 "add x25, %[inptr0], %[input_row_stride]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +000047 "add x9, %[input_col_stride1], %[input_col_stride1]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +000048 "add x16, x25, %[input_row_stride]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +000049 "add x19, x9, %[input_col_stride1]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +000050 "add x26, x16, %[input_row_stride]\n"
51 "add x20, x19, %[input_col_stride1]\n"
52 "add x17, x26, %[input_row_stride]\n"
53 "add x21, x20, %[input_col_stride1]\n"
54 "add x27, x17, %[input_row_stride]\n"
55 "add x28, %[outptr0], %[output_row_stride]\n"
56 "add x11, %[output_col_stride1], %[output_col_stride1]\n"
57 "add x22, x28, %[output_row_stride]\n"
58 "add x13, x11, %[output_col_stride1]\n"
59 "add x12, x22, %[output_row_stride]\n"
60 "add x23, x13, %[output_col_stride1]\n"
61 "add x14, x12, %[output_row_stride]\n"
62 "add x15, x23, %[output_col_stride1]\n"
63 "add x24, x14, %[output_row_stride]\n"
64 "cmp %w[n_channels], #4\n"
65 "blt 2f\n"
66 "1:\n"
67 "ldr q8, [%[inptr0], x20]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +000068 "ldr q2, [%[inptr0], x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +000069 "mov v14.16b, v8.16b\n"
70 "ldr q9, [%[inptr0]]\n"
71 "mov v10.16b, v8.16b\n"
72 "ldr q1, [%[inptr0], x21]\n"
73 "fmla v14.4s, v9.4s, v0.s[2]\n"
74 "ldr q4, [%[inptr0], x19]\n"
75 "mov v9.16b, v8.16b\n"
76 "ldr q12, [%[inptr0], %[input_col_stride1]]\n"
77 "fmls v10.4s, v12.4s, v0.s[2]\n"
78 "ldr q5, [x16, x20]\n"
79 "fmls v14.4s, v2.4s, v0.s[3]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +000080 "ldr q20, [x16, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +000081 "fmla v9.4s, v12.4s, v0.s[2]\n"
82 "ldr q3, [x16]\n"
83 "fmls v10.4s, v2.4s, v0.s[2]\n"
84 "ldr q6, [x16, x21]\n"
85 "mov v7.16b, v8.16b\n"
86 "ldr q16, [x16, x19]\n"
87 "fmls v9.4s, v2.4s, v0.s[2]\n"
88 "ldr q22, [x16, %[input_col_stride1]]\n"
89 "fadd v10.4s, v10.4s, v4.4s\n"
90 "ldr q17, [x17, x20]\n"
91 "fmls v7.4s, v12.4s, v0.s[1]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +000092 "ldr q15, [x17, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +000093 "fsub v9.4s, v9.4s, v4.4s\n"
94 "ldr q19, [x17]\n"
95 "mov v8.16b, v8.16b\n"
96 "ldr q18, [x17, x21]\n"
97 "fsub v7.4s, v7.4s, v2.4s\n"
98 "ldr q13, [x17, x19]\n"
99 "fmla v7.4s, v4.4s, v0.s[1]\n"
100 "ldr q21, [x17, %[input_col_stride1]]\n"
101 "fmla v8.4s, v12.4s, v0.s[1]\n"
102 "add %[inptr0], %[inptr0], #16\n"
103 "mov v11.16b, v1.16b\n"
104 "add x16, x16, #16\n"
105 "mov v1.16b, v5.16b\n"
106 "add x17, x17, #16\n"
107 "fsub v8.4s, v8.4s, v2.4s\n"
108 "fmla v11.4s, v12.4s, v0.s[2]\n"
109 "fmls v8.4s, v4.4s, v0.s[1]\n"
110 "fmla v1.4s, v3.4s, v0.s[2]\n"
111 "mov v2.16b, v5.16b\n"
112 "mov v3.16b, v5.16b\n"
113 "fmls v11.4s, v4.4s, v0.s[3]\n"
114 "mov v4.16b, v5.16b\n"
115 "fmls v1.4s, v20.4s, v0.s[3]\n"
116 "fmls v2.4s, v22.4s, v0.s[2]\n"
117 "fmla v3.4s, v22.4s, v0.s[2]\n"
118 "fmls v4.4s, v22.4s, v0.s[1]\n"
119 "mov v5.16b, v5.16b\n"
120 "mov v6.16b, v6.16b\n"
121 "fmls v2.4s, v20.4s, v0.s[2]\n"
122 "mov v12.16b, v17.16b\n"
123 "fmls v3.4s, v20.4s, v0.s[2]\n"
124 "fsub v4.4s, v4.4s, v20.4s\n"
125 "fmla v4.4s, v16.4s, v0.s[1]\n"
126 "fmla v5.4s, v22.4s, v0.s[1]\n"
127 "fadd v2.4s, v2.4s, v16.4s\n"
128 "fmla v6.4s, v22.4s, v0.s[2]\n"
129 "fsub v3.4s, v3.4s, v16.4s\n"
130 "fmla v12.4s, v19.4s, v0.s[2]\n"
131 "fsub v5.4s, v5.4s, v20.4s\n"
132 "mov v19.16b, v17.16b\n"
133 "fmls v5.4s, v16.4s, v0.s[1]\n"
134 "fmls v6.4s, v16.4s, v0.s[3]\n"
135 "fmls v12.4s, v15.4s, v0.s[3]\n"
136 "fmls v19.4s, v21.4s, v0.s[2]\n"
137 "mov v20.16b, v17.16b\n"
138 "mov v16.16b, v17.16b\n"
139 "mov v17.16b, v17.16b\n"
140 "mov v18.16b, v18.16b\n"
141 "fmls v19.4s, v15.4s, v0.s[2]\n"
142 "fmla v20.4s, v21.4s, v0.s[2]\n"
143 "fmls v16.4s, v21.4s, v0.s[1]\n"
144 "fmla v17.4s, v21.4s, v0.s[1]\n"
145 "fmla v18.4s, v21.4s, v0.s[2]\n"
146 "mov v23.16b, v12.16b\n"
147 "fadd v19.4s, v19.4s, v13.4s\n"
148 "fmls v20.4s, v15.4s, v0.s[2]\n"
149 "fsub v16.4s, v16.4s, v15.4s\n"
150 "fsub v17.4s, v17.4s, v15.4s\n"
151 "fmla v16.4s, v13.4s, v0.s[1]\n"
152 "fmls v17.4s, v13.4s, v0.s[1]\n"
153 "fsub v20.4s, v20.4s, v13.4s\n"
154 "fmls v18.4s, v13.4s, v0.s[3]\n"
155 "fmla v23.4s, v14.4s, v0.s[2]\n"
156 "mov v15.16b, v19.16b\n"
157 "mov v14.16b, v20.16b\n"
158 "mov v24.16b, v16.16b\n"
159 "fmla v15.4s, v10.4s, v0.s[2]\n"
160 "mov v10.16b, v17.16b\n"
161 "fmls v23.4s, v1.4s, v0.s[3]\n"
162 "fmla v14.4s, v9.4s, v0.s[2]\n"
163 "fmla v24.4s, v7.4s, v0.s[2]\n"
164 "fmla v10.4s, v8.4s, v0.s[2]\n"
165 "fmls v15.4s, v2.4s, v0.s[3]\n"
166 "mov v7.16b, v18.16b\n"
167 "str q23, [%[outptr0]]\n"
168 "fmls v14.4s, v3.4s, v0.s[3]\n"
169 "fmls v24.4s, v4.4s, v0.s[3]\n"
170 "fmls v10.4s, v5.4s, v0.s[3]\n"
171 "str q15, [%[outptr0], %[output_col_stride1]]\n"
172 "fmla v7.4s, v11.4s, v0.s[2]\n"
173 "str q14, [%[outptr0], x11]\n"
174 "str q24, [%[outptr0], x13]\n"
175 "str q10, [%[outptr0], x23]\n"
176 "fmls v7.4s, v6.4s, v0.s[3]\n"
177 "str q7, [%[outptr0], x15]\n"
178 "add %[outptr0], %[outptr0], #16\n"
179 "mov v26.16b, v12.16b\n"
180 "mov v25.16b, v19.16b\n"
181 "ldr q11, [x25, x20]\n"
182 "mov v10.16b, v11.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000183 "ldr q23, [x25, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000184 "mov v9.16b, v11.16b\n"
185 "ldr q7, [x25]\n"
186 "fmla v10.4s, v7.4s, v0.s[2]\n"
187 "ldr q13, [x25, x21]\n"
188 "mov v7.16b, v11.16b\n"
189 "ldr q31, [x25, x19]\n"
190 "mov v8.16b, v11.16b\n"
191 "ldr q21, [x25, %[input_col_stride1]]\n"
192 "fmls v10.4s, v23.4s, v0.s[3]\n"
193 "ldr q30, [x26, x20]\n"
194 "fmls v9.4s, v21.4s, v0.s[2]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000195 "ldr q29, [x26, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000196 "fmla v7.4s, v21.4s, v0.s[2]\n"
197 "ldr q22, [x26]\n"
198 "fmls v8.4s, v21.4s, v0.s[1]\n"
199 "ldr q24, [x26, x21]\n"
200 "fmls v9.4s, v23.4s, v0.s[2]\n"
201 "ldr q27, [x26, x19]\n"
202 "fmls v7.4s, v23.4s, v0.s[2]\n"
203 "ldr q28, [x26, %[input_col_stride1]]\n"
204 "fsub v8.4s, v8.4s, v23.4s\n"
205 "add x25, x25, #16\n"
206 "fadd v9.4s, v9.4s, v31.4s\n"
207 "add x26, x26, #16\n"
208 "fsub v7.4s, v7.4s, v31.4s\n"
209 "fmla v8.4s, v31.4s, v0.s[1]\n"
210 "mov v11.16b, v11.16b\n"
211 "mov v15.16b, v13.16b\n"
212 "mov v14.16b, v30.16b\n"
213 "mov v13.16b, v30.16b\n"
214 "fmla v11.4s, v21.4s, v0.s[1]\n"
215 "fmla v15.4s, v21.4s, v0.s[2]\n"
216 "fmla v14.4s, v22.4s, v0.s[2]\n"
217 "fmls v13.4s, v28.4s, v0.s[2]\n"
218 "mov v21.16b, v30.16b\n"
219 "mov v22.16b, v30.16b\n"
220 "fsub v11.4s, v11.4s, v23.4s\n"
221 "fmls v15.4s, v31.4s, v0.s[3]\n"
222 "fmls v11.4s, v31.4s, v0.s[1]\n"
223 "fmls v14.4s, v29.4s, v0.s[3]\n"
224 "fmls v13.4s, v29.4s, v0.s[2]\n"
225 "fmla v21.4s, v28.4s, v0.s[2]\n"
226 "fmls v22.4s, v28.4s, v0.s[1]\n"
227 "mov v23.16b, v30.16b\n"
228 "mov v24.16b, v24.16b\n"
229 "fmls v26.4s, v10.4s, v0.s[2]\n"
230 "fadd v13.4s, v13.4s, v27.4s\n"
231 "fmls v21.4s, v29.4s, v0.s[2]\n"
232 "fsub v22.4s, v22.4s, v29.4s\n"
233 "fmla v23.4s, v28.4s, v0.s[1]\n"
234 "fmla v22.4s, v27.4s, v0.s[1]\n"
235 "fmla v24.4s, v28.4s, v0.s[2]\n"
236 "fsub v21.4s, v21.4s, v27.4s\n"
237 "fmls v26.4s, v1.4s, v0.s[2]\n"
238 "fsub v23.4s, v23.4s, v29.4s\n"
239 "fmls v25.4s, v9.4s, v0.s[2]\n"
240 "fmls v23.4s, v27.4s, v0.s[1]\n"
241 "fmls v24.4s, v27.4s, v0.s[3]\n"
242 "fadd v26.4s, v26.4s, v14.4s\n"
243 "mov v27.16b, v20.16b\n"
244 "str q26, [x28]\n"
245 "fmls v25.4s, v2.4s, v0.s[2]\n"
246 "fmls v27.4s, v7.4s, v0.s[2]\n"
247 "mov v31.16b, v16.16b\n"
248 "mov v30.16b, v17.16b\n"
249 "mov v29.16b, v18.16b\n"
250 "fadd v25.4s, v25.4s, v13.4s\n"
251 "fmls v31.4s, v8.4s, v0.s[2]\n"
252 "str q25, [x28, %[output_col_stride1]]\n"
253 "fmls v27.4s, v3.4s, v0.s[2]\n"
254 "fmls v30.4s, v11.4s, v0.s[2]\n"
255 "fmls v29.4s, v15.4s, v0.s[2]\n"
256 "fmls v31.4s, v4.4s, v0.s[2]\n"
257 "mov v26.16b, v12.16b\n"
258 "fadd v27.4s, v27.4s, v21.4s\n"
259 "mov v25.16b, v19.16b\n"
260 "str q27, [x28, x11]\n"
261 "fmls v30.4s, v5.4s, v0.s[2]\n"
262 "fadd v31.4s, v31.4s, v22.4s\n"
263 "fmls v29.4s, v6.4s, v0.s[2]\n"
264 "str q31, [x28, x13]\n"
265 "fmla v26.4s, v10.4s, v0.s[2]\n"
266 "fadd v30.4s, v30.4s, v23.4s\n"
267 "fmla v25.4s, v9.4s, v0.s[2]\n"
268 "str q30, [x28, x23]\n"
269 "fadd v29.4s, v29.4s, v24.4s\n"
270 "str q29, [x28, x15]\n"
271 "fmls v26.4s, v1.4s, v0.s[2]\n"
272 "fmls v25.4s, v2.4s, v0.s[2]\n"
273 "add x28, x28, #16\n"
274 "mov v30.16b, v20.16b\n"
275 "mov v29.16b, v16.16b\n"
276 "fsub v26.4s, v26.4s, v14.4s\n"
277 "mov v28.16b, v17.16b\n"
278 "str q26, [x22]\n"
279 "fsub v25.4s, v25.4s, v13.4s\n"
280 "str q25, [x22, %[output_col_stride1]]\n"
281 "fmla v30.4s, v7.4s, v0.s[2]\n"
282 "fmla v29.4s, v8.4s, v0.s[2]\n"
283 "fmla v28.4s, v11.4s, v0.s[2]\n"
284 "mov v26.16b, v18.16b\n"
285 "mov v25.16b, v12.16b\n"
286 "fmls v30.4s, v3.4s, v0.s[2]\n"
287 "mov v31.16b, v19.16b\n"
288 "fmls v29.4s, v4.4s, v0.s[2]\n"
289 "fmls v28.4s, v5.4s, v0.s[2]\n"
290 "fmla v26.4s, v15.4s, v0.s[2]\n"
291 "fmls v25.4s, v10.4s, v0.s[1]\n"
292 "fsub v30.4s, v30.4s, v21.4s\n"
293 "fmls v31.4s, v9.4s, v0.s[1]\n"
294 "str q30, [x22, x11]\n"
295 "fsub v29.4s, v29.4s, v22.4s\n"
296 "str q29, [x22, x13]\n"
297 "fsub v28.4s, v28.4s, v23.4s\n"
298 "str q28, [x22, x23]\n"
299 "fmls v26.4s, v6.4s, v0.s[2]\n"
300 "fsub v25.4s, v25.4s, v1.4s\n"
301 "fsub v31.4s, v31.4s, v2.4s\n"
302 "fmla v25.4s, v14.4s, v0.s[1]\n"
303 "fmla v31.4s, v13.4s, v0.s[1]\n"
304 "fsub v26.4s, v26.4s, v24.4s\n"
305 "mov v27.16b, v20.16b\n"
306 "str q26, [x22, x15]\n"
307 "mov v26.16b, v16.16b\n"
308 "str q25, [x12]\n"
309 "fmls v27.4s, v7.4s, v0.s[1]\n"
310 "str q31, [x12, %[output_col_stride1]]\n"
311 "fmls v26.4s, v8.4s, v0.s[1]\n"
312 "mov v25.16b, v17.16b\n"
313 "add x22, x22, #16\n"
314 "fsub v27.4s, v27.4s, v3.4s\n"
315 "mov v28.16b, v18.16b\n"
316 "fmla v27.4s, v21.4s, v0.s[1]\n"
317 "fsub v26.4s, v26.4s, v4.4s\n"
318 "fmla v26.4s, v22.4s, v0.s[1]\n"
319 "fmls v25.4s, v11.4s, v0.s[1]\n"
320 "fmls v28.4s, v15.4s, v0.s[1]\n"
321 "mov v12.16b, v12.16b\n"
322 "str q27, [x12, x11]\n"
323 "mov v19.16b, v19.16b\n"
324 "str q26, [x12, x13]\n"
325 "fsub v25.4s, v25.4s, v5.4s\n"
326 "fmla v25.4s, v23.4s, v0.s[1]\n"
327 "fsub v28.4s, v28.4s, v6.4s\n"
328 "fmla v28.4s, v24.4s, v0.s[1]\n"
329 "fmla v12.4s, v10.4s, v0.s[1]\n"
330 "fmla v19.4s, v9.4s, v0.s[1]\n"
331 "mov v20.16b, v20.16b\n"
332 "str q25, [x12, x23]\n"
333 "mov v16.16b, v16.16b\n"
334 "str q28, [x12, x15]\n"
335 "fsub v12.4s, v12.4s, v1.4s\n"
336 "fmls v12.4s, v14.4s, v0.s[1]\n"
337 "add x12, x12, #16\n"
338 "fsub v19.4s, v19.4s, v2.4s\n"
339 "fmla v20.4s, v7.4s, v0.s[1]\n"
340 "fmls v19.4s, v13.4s, v0.s[1]\n"
341 "fmla v16.4s, v8.4s, v0.s[1]\n"
342 "str q12, [x14]\n"
343 "mov v1.16b, v17.16b\n"
344 "fsub v20.4s, v20.4s, v3.4s\n"
345 "mov v17.16b, v18.16b\n"
346 "str q19, [x14, %[output_col_stride1]]\n"
347 "fmls v20.4s, v21.4s, v0.s[1]\n"
348 "fsub v16.4s, v16.4s, v4.4s\n"
349 "fmla v1.4s, v11.4s, v0.s[1]\n"
350 "fmls v16.4s, v22.4s, v0.s[1]\n"
351 "fmla v17.4s, v15.4s, v0.s[1]\n"
352 "str q20, [x14, x11]\n"
353 "fsub v1.4s, v1.4s, v5.4s\n"
354 "str q16, [x14, x13]\n"
355 "fmls v1.4s, v23.4s, v0.s[1]\n"
356 "fsub v17.4s, v17.4s, v6.4s\n"
357 "fmls v17.4s, v24.4s, v0.s[1]\n"
358 "str q1, [x14, x23]\n"
359 "str q17, [x14, x15]\n"
360 "add x14, x14, #16\n"
361 "ldr q2, [x27, x20]\n"
362 "mov v4.16b, v2.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000363 "ldr q17, [x27, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000364 "mov v12.16b, v2.16b\n"
365 "ldr q18, [x27]\n"
366 "fmla v4.4s, v18.4s, v0.s[2]\n"
367 "ldr q3, [x27, x21]\n"
368 "mov v6.16b, v2.16b\n"
369 "ldr q5, [x27, x19]\n"
370 "mov v1.16b, v2.16b\n"
371 "ldr q18, [x27, %[input_col_stride1]]\n"
372 "fmls v4.4s, v17.4s, v0.s[3]\n"
373 "add x27, x27, #16\n"
374 "fmls v12.4s, v18.4s, v0.s[2]\n"
375 "sub %w[n_channels], %w[n_channels], #4\n"
376 "fmla v6.4s, v18.4s, v0.s[2]\n"
377 "cmp %w[n_channels], #4\n"
378 "fmls v1.4s, v18.4s, v0.s[1]\n"
379 "mov v2.16b, v2.16b\n"
380 "fmls v12.4s, v17.4s, v0.s[2]\n"
381 "mov v3.16b, v3.16b\n"
382 "fmls v6.4s, v17.4s, v0.s[2]\n"
383 "fmla v2.4s, v18.4s, v0.s[1]\n"
384 "fsub v1.4s, v1.4s, v17.4s\n"
385 "fmla v3.4s, v18.4s, v0.s[2]\n"
386 "fadd v12.4s, v12.4s, v5.4s\n"
387 "fmla v1.4s, v5.4s, v0.s[1]\n"
388 "fsub v6.4s, v6.4s, v5.4s\n"
389 "fsub v2.4s, v2.4s, v17.4s\n"
390 "fmls v2.4s, v5.4s, v0.s[1]\n"
391 "fmls v3.4s, v5.4s, v0.s[3]\n"
392 "mov v4.16b, v4.16b\n"
393 "mov v16.16b, v12.16b\n"
394 "mov v5.16b, v6.16b\n"
395 "mov v6.16b, v1.16b\n"
396 "fmla v4.4s, v10.4s, v0.s[2]\n"
397 "fmla v16.4s, v9.4s, v0.s[2]\n"
398 "fmla v5.4s, v7.4s, v0.s[2]\n"
399 "fmla v6.4s, v8.4s, v0.s[2]\n"
400 "mov v9.16b, v2.16b\n"
401 "mov v10.16b, v3.16b\n"
402 "fmls v4.4s, v14.4s, v0.s[3]\n"
403 "fmls v16.4s, v13.4s, v0.s[3]\n"
404 "fmls v5.4s, v21.4s, v0.s[3]\n"
405 "fmls v6.4s, v22.4s, v0.s[3]\n"
406 "fmla v9.4s, v11.4s, v0.s[2]\n"
407 "fmla v10.4s, v15.4s, v0.s[2]\n"
408 "str q4, [x24]\n"
409 "str q16, [x24, %[output_col_stride1]]\n"
410 "str q5, [x24, x11]\n"
411 "str q6, [x24, x13]\n"
412 "fmls v9.4s, v23.4s, v0.s[3]\n"
413 "fmls v10.4s, v24.4s, v0.s[3]\n"
414 "str q9, [x24, x23]\n"
415 "str q10, [x24, x15]\n"
416 "add x24, x24, #16\n"
417 "bge 1b\n"
418 "2:\n"
419 "cmp %w[n_channels], #2\n"
420 "blt 3f\n"
421 "ldr d8, [%[inptr0], x20]\n"
422 "mov v14.16b, v8.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000423 "ldr d2, [%[inptr0], x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000424 "mov v10.16b, v8.16b\n"
425 "ldr d9, [%[inptr0]]\n"
426 "fmla v14.4s, v9.4s, v0.s[2]\n"
427 "ldr d1, [%[inptr0], x21]\n"
428 "mov v9.16b, v8.16b\n"
429 "ldr d4, [%[inptr0], x19]\n"
430 "mov v7.16b, v8.16b\n"
431 "ldr d12, [%[inptr0], %[input_col_stride1]]\n"
432 "fmls v14.4s, v2.4s, v0.s[3]\n"
433 "ldr d5, [x16, x20]\n"
434 "fmls v10.4s, v12.4s, v0.s[2]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000435 "ldr d20, [x16, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000436 "fmla v9.4s, v12.4s, v0.s[2]\n"
437 "ldr d3, [x16]\n"
438 "fmls v7.4s, v12.4s, v0.s[1]\n"
439 "ldr d6, [x16, x21]\n"
440 "fmls v10.4s, v2.4s, v0.s[2]\n"
441 "ldr d16, [x16, x19]\n"
442 "fmls v9.4s, v2.4s, v0.s[2]\n"
443 "ldr d22, [x16, %[input_col_stride1]]\n"
444 "fsub v7.4s, v7.4s, v2.4s\n"
445 "ldr d17, [x17, x20]\n"
446 "fadd v10.4s, v10.4s, v4.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000447 "ldr d15, [x17, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000448 "fsub v9.4s, v9.4s, v4.4s\n"
449 "ldr d19, [x17]\n"
450 "fmla v7.4s, v4.4s, v0.s[1]\n"
451 "ldr d18, [x17, x21]\n"
452 "mov v8.16b, v8.16b\n"
453 "ldr d13, [x17, x19]\n"
454 "mov v11.16b, v1.16b\n"
455 "ldr d21, [x17, %[input_col_stride1]]\n"
456 "fmla v8.4s, v12.4s, v0.s[1]\n"
457 "add %[inptr0], %[inptr0], #8\n"
458 "fmla v11.4s, v12.4s, v0.s[2]\n"
459 "add x16, x16, #8\n"
460 "mov v1.16b, v5.16b\n"
461 "add x17, x17, #8\n"
462 "fsub v8.4s, v8.4s, v2.4s\n"
463 "mov v2.16b, v5.16b\n"
464 "fmls v8.4s, v4.4s, v0.s[1]\n"
465 "fmls v11.4s, v4.4s, v0.s[3]\n"
466 "fmla v1.4s, v3.4s, v0.s[2]\n"
467 "fmls v2.4s, v22.4s, v0.s[2]\n"
468 "mov v3.16b, v5.16b\n"
469 "mov v4.16b, v5.16b\n"
470 "mov v5.16b, v5.16b\n"
471 "mov v6.16b, v6.16b\n"
472 "fmls v1.4s, v20.4s, v0.s[3]\n"
473 "fmls v2.4s, v20.4s, v0.s[2]\n"
474 "fmla v3.4s, v22.4s, v0.s[2]\n"
475 "fmls v4.4s, v22.4s, v0.s[1]\n"
476 "fmla v5.4s, v22.4s, v0.s[1]\n"
477 "fmla v6.4s, v22.4s, v0.s[2]\n"
478 "fadd v2.4s, v2.4s, v16.4s\n"
479 "mov v12.16b, v17.16b\n"
480 "fmls v3.4s, v20.4s, v0.s[2]\n"
481 "fsub v4.4s, v4.4s, v20.4s\n"
482 "fmla v4.4s, v16.4s, v0.s[1]\n"
483 "fsub v5.4s, v5.4s, v20.4s\n"
484 "fmls v5.4s, v16.4s, v0.s[1]\n"
485 "fmls v6.4s, v16.4s, v0.s[3]\n"
486 "fsub v3.4s, v3.4s, v16.4s\n"
487 "fmla v12.4s, v19.4s, v0.s[2]\n"
488 "mov v19.16b, v17.16b\n"
489 "mov v20.16b, v17.16b\n"
490 "mov v16.16b, v17.16b\n"
491 "mov v17.16b, v17.16b\n"
492 "fmls v12.4s, v15.4s, v0.s[3]\n"
493 "fmls v19.4s, v21.4s, v0.s[2]\n"
494 "fmla v20.4s, v21.4s, v0.s[2]\n"
495 "fmls v16.4s, v21.4s, v0.s[1]\n"
496 "fmla v17.4s, v21.4s, v0.s[1]\n"
497 "mov v18.16b, v18.16b\n"
498 "fmls v19.4s, v15.4s, v0.s[2]\n"
499 "mov v23.16b, v12.16b\n"
500 "fmls v20.4s, v15.4s, v0.s[2]\n"
501 "fsub v16.4s, v16.4s, v15.4s\n"
502 "fmla v16.4s, v13.4s, v0.s[1]\n"
503 "fsub v17.4s, v17.4s, v15.4s\n"
504 "fadd v19.4s, v19.4s, v13.4s\n"
505 "fmls v17.4s, v13.4s, v0.s[1]\n"
506 "fsub v20.4s, v20.4s, v13.4s\n"
507 "fmla v18.4s, v21.4s, v0.s[2]\n"
508 "fmla v23.4s, v14.4s, v0.s[2]\n"
509 "mov v15.16b, v19.16b\n"
510 "mov v14.16b, v20.16b\n"
511 "mov v24.16b, v16.16b\n"
512 "fmls v18.4s, v13.4s, v0.s[3]\n"
513 "fmla v15.4s, v10.4s, v0.s[2]\n"
514 "fmls v23.4s, v1.4s, v0.s[3]\n"
515 "fmla v14.4s, v9.4s, v0.s[2]\n"
516 "fmla v24.4s, v7.4s, v0.s[2]\n"
517 "mov v10.16b, v17.16b\n"
518 "fmls v15.4s, v2.4s, v0.s[3]\n"
519 "mov v7.16b, v18.16b\n"
520 "str d23, [%[outptr0]]\n"
521 "fmls v14.4s, v3.4s, v0.s[3]\n"
522 "fmls v24.4s, v4.4s, v0.s[3]\n"
523 "fmla v10.4s, v8.4s, v0.s[2]\n"
524 "str d15, [%[outptr0], %[output_col_stride1]]\n"
525 "fmla v7.4s, v11.4s, v0.s[2]\n"
526 "str d14, [%[outptr0], x11]\n"
527 "fmls v10.4s, v5.4s, v0.s[3]\n"
528 "str d24, [%[outptr0], x13]\n"
529 "fmls v7.4s, v6.4s, v0.s[3]\n"
530 "str d10, [%[outptr0], x23]\n"
531 "str d7, [%[outptr0], x15]\n"
532 "add %[outptr0], %[outptr0], #8\n"
533 "mov v26.16b, v12.16b\n"
534 "mov v25.16b, v19.16b\n"
535 "ldr d11, [x25, x20]\n"
536 "mov v10.16b, v11.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000537 "ldr d23, [x25, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000538 "mov v9.16b, v11.16b\n"
539 "ldr d7, [x25]\n"
540 "fmla v10.4s, v7.4s, v0.s[2]\n"
541 "ldr d13, [x25, x21]\n"
542 "mov v7.16b, v11.16b\n"
543 "ldr d31, [x25, x19]\n"
544 "mov v8.16b, v11.16b\n"
545 "ldr d21, [x25, %[input_col_stride1]]\n"
546 "fmls v10.4s, v23.4s, v0.s[3]\n"
547 "ldr d30, [x26, x20]\n"
548 "fmls v9.4s, v21.4s, v0.s[2]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000549 "ldr d29, [x26, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000550 "fmla v7.4s, v21.4s, v0.s[2]\n"
551 "ldr d22, [x26]\n"
552 "fmls v8.4s, v21.4s, v0.s[1]\n"
553 "ldr d24, [x26, x21]\n"
554 "fmls v9.4s, v23.4s, v0.s[2]\n"
555 "ldr d27, [x26, x19]\n"
556 "fmls v7.4s, v23.4s, v0.s[2]\n"
557 "ldr d28, [x26, %[input_col_stride1]]\n"
558 "fsub v8.4s, v8.4s, v23.4s\n"
559 "add x25, x25, #8\n"
560 "fadd v9.4s, v9.4s, v31.4s\n"
561 "add x26, x26, #8\n"
562 "fsub v7.4s, v7.4s, v31.4s\n"
563 "fmla v8.4s, v31.4s, v0.s[1]\n"
564 "mov v11.16b, v11.16b\n"
565 "mov v15.16b, v13.16b\n"
566 "mov v14.16b, v30.16b\n"
567 "mov v13.16b, v30.16b\n"
568 "fmla v11.4s, v21.4s, v0.s[1]\n"
569 "fmla v15.4s, v21.4s, v0.s[2]\n"
570 "fmla v14.4s, v22.4s, v0.s[2]\n"
571 "fmls v13.4s, v28.4s, v0.s[2]\n"
572 "mov v21.16b, v30.16b\n"
573 "mov v22.16b, v30.16b\n"
574 "fsub v11.4s, v11.4s, v23.4s\n"
575 "fmls v15.4s, v31.4s, v0.s[3]\n"
576 "fmls v11.4s, v31.4s, v0.s[1]\n"
577 "fmls v14.4s, v29.4s, v0.s[3]\n"
578 "fmls v13.4s, v29.4s, v0.s[2]\n"
579 "fmla v21.4s, v28.4s, v0.s[2]\n"
580 "fmls v22.4s, v28.4s, v0.s[1]\n"
581 "mov v23.16b, v30.16b\n"
582 "mov v24.16b, v24.16b\n"
583 "fmls v26.4s, v10.4s, v0.s[2]\n"
584 "fadd v13.4s, v13.4s, v27.4s\n"
585 "fmls v21.4s, v29.4s, v0.s[2]\n"
586 "fsub v22.4s, v22.4s, v29.4s\n"
587 "fmla v23.4s, v28.4s, v0.s[1]\n"
588 "fmla v22.4s, v27.4s, v0.s[1]\n"
589 "fmla v24.4s, v28.4s, v0.s[2]\n"
590 "fsub v21.4s, v21.4s, v27.4s\n"
591 "fmls v26.4s, v1.4s, v0.s[2]\n"
592 "fsub v23.4s, v23.4s, v29.4s\n"
593 "fmls v25.4s, v9.4s, v0.s[2]\n"
594 "fmls v23.4s, v27.4s, v0.s[1]\n"
595 "fmls v24.4s, v27.4s, v0.s[3]\n"
596 "fadd v26.4s, v26.4s, v14.4s\n"
597 "mov v27.16b, v20.16b\n"
598 "str d26, [x28]\n"
599 "fmls v25.4s, v2.4s, v0.s[2]\n"
600 "fmls v27.4s, v7.4s, v0.s[2]\n"
601 "mov v31.16b, v16.16b\n"
602 "mov v30.16b, v17.16b\n"
603 "mov v29.16b, v18.16b\n"
604 "fadd v25.4s, v25.4s, v13.4s\n"
605 "fmls v31.4s, v8.4s, v0.s[2]\n"
606 "str d25, [x28, %[output_col_stride1]]\n"
607 "fmls v27.4s, v3.4s, v0.s[2]\n"
608 "fmls v30.4s, v11.4s, v0.s[2]\n"
609 "fmls v29.4s, v15.4s, v0.s[2]\n"
610 "fmls v31.4s, v4.4s, v0.s[2]\n"
611 "mov v26.16b, v12.16b\n"
612 "fadd v27.4s, v27.4s, v21.4s\n"
613 "mov v25.16b, v19.16b\n"
614 "str d27, [x28, x11]\n"
615 "fmls v30.4s, v5.4s, v0.s[2]\n"
616 "fadd v31.4s, v31.4s, v22.4s\n"
617 "fmls v29.4s, v6.4s, v0.s[2]\n"
618 "str d31, [x28, x13]\n"
619 "fmla v26.4s, v10.4s, v0.s[2]\n"
620 "fadd v30.4s, v30.4s, v23.4s\n"
621 "fmla v25.4s, v9.4s, v0.s[2]\n"
622 "str d30, [x28, x23]\n"
623 "fadd v29.4s, v29.4s, v24.4s\n"
624 "str d29, [x28, x15]\n"
625 "fmls v26.4s, v1.4s, v0.s[2]\n"
626 "fmls v25.4s, v2.4s, v0.s[2]\n"
627 "add x28, x28, #8\n"
628 "mov v30.16b, v20.16b\n"
629 "mov v29.16b, v16.16b\n"
630 "fsub v26.4s, v26.4s, v14.4s\n"
631 "mov v28.16b, v17.16b\n"
632 "str d26, [x22]\n"
633 "fsub v25.4s, v25.4s, v13.4s\n"
634 "str d25, [x22, %[output_col_stride1]]\n"
635 "fmla v30.4s, v7.4s, v0.s[2]\n"
636 "fmla v29.4s, v8.4s, v0.s[2]\n"
637 "fmla v28.4s, v11.4s, v0.s[2]\n"
638 "mov v26.16b, v18.16b\n"
639 "mov v25.16b, v12.16b\n"
640 "fmls v30.4s, v3.4s, v0.s[2]\n"
641 "mov v31.16b, v19.16b\n"
642 "fmls v29.4s, v4.4s, v0.s[2]\n"
643 "fmls v28.4s, v5.4s, v0.s[2]\n"
644 "fmla v26.4s, v15.4s, v0.s[2]\n"
645 "fmls v25.4s, v10.4s, v0.s[1]\n"
646 "fsub v30.4s, v30.4s, v21.4s\n"
647 "fmls v31.4s, v9.4s, v0.s[1]\n"
648 "str d30, [x22, x11]\n"
649 "fsub v29.4s, v29.4s, v22.4s\n"
650 "str d29, [x22, x13]\n"
651 "fsub v28.4s, v28.4s, v23.4s\n"
652 "str d28, [x22, x23]\n"
653 "fmls v26.4s, v6.4s, v0.s[2]\n"
654 "fsub v25.4s, v25.4s, v1.4s\n"
655 "fsub v31.4s, v31.4s, v2.4s\n"
656 "fmla v25.4s, v14.4s, v0.s[1]\n"
657 "fmla v31.4s, v13.4s, v0.s[1]\n"
658 "fsub v26.4s, v26.4s, v24.4s\n"
659 "mov v27.16b, v20.16b\n"
660 "str d26, [x22, x15]\n"
661 "mov v26.16b, v16.16b\n"
662 "str d25, [x12]\n"
663 "fmls v27.4s, v7.4s, v0.s[1]\n"
664 "str d31, [x12, %[output_col_stride1]]\n"
665 "fmls v26.4s, v8.4s, v0.s[1]\n"
666 "mov v25.16b, v17.16b\n"
667 "add x22, x22, #8\n"
668 "fsub v27.4s, v27.4s, v3.4s\n"
669 "mov v28.16b, v18.16b\n"
670 "fmla v27.4s, v21.4s, v0.s[1]\n"
671 "fsub v26.4s, v26.4s, v4.4s\n"
672 "fmla v26.4s, v22.4s, v0.s[1]\n"
673 "fmls v25.4s, v11.4s, v0.s[1]\n"
674 "fmls v28.4s, v15.4s, v0.s[1]\n"
675 "mov v12.16b, v12.16b\n"
676 "str d27, [x12, x11]\n"
677 "mov v19.16b, v19.16b\n"
678 "str d26, [x12, x13]\n"
679 "fsub v25.4s, v25.4s, v5.4s\n"
680 "fmla v25.4s, v23.4s, v0.s[1]\n"
681 "fsub v28.4s, v28.4s, v6.4s\n"
682 "fmla v28.4s, v24.4s, v0.s[1]\n"
683 "fmla v12.4s, v10.4s, v0.s[1]\n"
684 "fmla v19.4s, v9.4s, v0.s[1]\n"
685 "mov v20.16b, v20.16b\n"
686 "str d25, [x12, x23]\n"
687 "mov v16.16b, v16.16b\n"
688 "str d28, [x12, x15]\n"
689 "fsub v12.4s, v12.4s, v1.4s\n"
690 "fmls v12.4s, v14.4s, v0.s[1]\n"
691 "add x12, x12, #8\n"
692 "fsub v19.4s, v19.4s, v2.4s\n"
693 "fmla v20.4s, v7.4s, v0.s[1]\n"
694 "fmls v19.4s, v13.4s, v0.s[1]\n"
695 "fmla v16.4s, v8.4s, v0.s[1]\n"
696 "str d12, [x14]\n"
697 "mov v1.16b, v17.16b\n"
698 "fsub v20.4s, v20.4s, v3.4s\n"
699 "mov v17.16b, v18.16b\n"
700 "str d19, [x14, %[output_col_stride1]]\n"
701 "fmls v20.4s, v21.4s, v0.s[1]\n"
702 "fsub v16.4s, v16.4s, v4.4s\n"
703 "fmla v1.4s, v11.4s, v0.s[1]\n"
704 "fmls v16.4s, v22.4s, v0.s[1]\n"
705 "fmla v17.4s, v15.4s, v0.s[1]\n"
706 "str d20, [x14, x11]\n"
707 "fsub v1.4s, v1.4s, v5.4s\n"
708 "str d16, [x14, x13]\n"
709 "fmls v1.4s, v23.4s, v0.s[1]\n"
710 "fsub v17.4s, v17.4s, v6.4s\n"
711 "fmls v17.4s, v24.4s, v0.s[1]\n"
712 "str d1, [x14, x23]\n"
713 "str d17, [x14, x15]\n"
714 "add x14, x14, #8\n"
715 "ldr d2, [x27, x20]\n"
716 "mov v4.16b, v2.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000717 "ldr d17, [x27, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000718 "mov v12.16b, v2.16b\n"
719 "ldr d18, [x27]\n"
720 "fmla v4.4s, v18.4s, v0.s[2]\n"
721 "ldr d3, [x27, x21]\n"
722 "mov v6.16b, v2.16b\n"
723 "ldr d5, [x27, x19]\n"
724 "mov v1.16b, v2.16b\n"
725 "ldr d18, [x27, %[input_col_stride1]]\n"
726 "fmls v4.4s, v17.4s, v0.s[3]\n"
727 "add x27, x27, #8\n"
728 "fmls v12.4s, v18.4s, v0.s[2]\n"
729 "sub %w[n_channels], %w[n_channels], #2\n"
730 "fmla v6.4s, v18.4s, v0.s[2]\n"
731 "fmls v1.4s, v18.4s, v0.s[1]\n"
732 "mov v2.16b, v2.16b\n"
733 "mov v3.16b, v3.16b\n"
734 "fmls v12.4s, v17.4s, v0.s[2]\n"
735 "mov v4.16b, v4.16b\n"
736 "fmls v6.4s, v17.4s, v0.s[2]\n"
737 "fsub v1.4s, v1.4s, v17.4s\n"
738 "fmla v1.4s, v5.4s, v0.s[1]\n"
739 "fmla v2.4s, v18.4s, v0.s[1]\n"
740 "fadd v12.4s, v12.4s, v5.4s\n"
741 "fmla v3.4s, v18.4s, v0.s[2]\n"
742 "fsub v6.4s, v6.4s, v5.4s\n"
743 "fmla v4.4s, v10.4s, v0.s[2]\n"
744 "fsub v2.4s, v2.4s, v17.4s\n"
745 "mov v16.16b, v12.16b\n"
746 "fmls v2.4s, v5.4s, v0.s[1]\n"
747 "fmls v3.4s, v5.4s, v0.s[3]\n"
748 "fmls v4.4s, v14.4s, v0.s[3]\n"
749 "fmla v16.4s, v9.4s, v0.s[2]\n"
750 "mov v5.16b, v6.16b\n"
751 "mov v6.16b, v1.16b\n"
752 "mov v9.16b, v2.16b\n"
753 "mov v10.16b, v3.16b\n"
754 "str d4, [x24]\n"
755 "fmls v16.4s, v13.4s, v0.s[3]\n"
756 "fmla v5.4s, v7.4s, v0.s[2]\n"
757 "fmla v6.4s, v8.4s, v0.s[2]\n"
758 "fmla v9.4s, v11.4s, v0.s[2]\n"
759 "fmla v10.4s, v15.4s, v0.s[2]\n"
760 "str d16, [x24, %[output_col_stride1]]\n"
761 "fmls v5.4s, v21.4s, v0.s[3]\n"
762 "fmls v6.4s, v22.4s, v0.s[3]\n"
763 "fmls v9.4s, v23.4s, v0.s[3]\n"
764 "fmls v10.4s, v24.4s, v0.s[3]\n"
765 "str d5, [x24, x11]\n"
766 "str d6, [x24, x13]\n"
767 "str d9, [x24, x23]\n"
768 "str d10, [x24, x15]\n"
769 "add x24, x24, #8\n"
770 "3:\n"
771 "cbz %w[n_channels], 4f\n"
772 "ldr s8, [%[inptr0], x20]\n"
773 "mov v14.16b, v8.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000774 "ldr s2, [%[inptr0], x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000775 "mov v10.16b, v8.16b\n"
776 "ldr s9, [%[inptr0]]\n"
777 "fmla v14.4s, v9.4s, v0.s[2]\n"
778 "ldr s1, [%[inptr0], x21]\n"
779 "mov v9.16b, v8.16b\n"
780 "ldr s4, [%[inptr0], x19]\n"
781 "mov v7.16b, v8.16b\n"
782 "ldr s12, [%[inptr0], %[input_col_stride1]]\n"
783 "fmls v14.4s, v2.4s, v0.s[3]\n"
784 "ldr s5, [x16, x20]\n"
785 "fmls v10.4s, v12.4s, v0.s[2]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000786 "ldr s20, [x16, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000787 "fmla v9.4s, v12.4s, v0.s[2]\n"
788 "ldr s3, [x16]\n"
789 "fmls v7.4s, v12.4s, v0.s[1]\n"
790 "ldr s6, [x16, x21]\n"
791 "fmls v10.4s, v2.4s, v0.s[2]\n"
792 "ldr s16, [x16, x19]\n"
793 "fmls v9.4s, v2.4s, v0.s[2]\n"
794 "ldr s22, [x16, %[input_col_stride1]]\n"
795 "fsub v7.4s, v7.4s, v2.4s\n"
796 "ldr s17, [x17, x20]\n"
797 "fadd v10.4s, v10.4s, v4.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000798 "ldr s15, [x17, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000799 "fsub v9.4s, v9.4s, v4.4s\n"
800 "ldr s19, [x17]\n"
801 "fmla v7.4s, v4.4s, v0.s[1]\n"
802 "ldr s18, [x17, x21]\n"
803 "mov v8.16b, v8.16b\n"
804 "ldr s13, [x17, x19]\n"
805 "mov v11.16b, v1.16b\n"
806 "ldr s21, [x17, %[input_col_stride1]]\n"
807 "fmla v8.4s, v12.4s, v0.s[1]\n"
808 "add %[inptr0], %[inptr0], #4\n"
809 "fmla v11.4s, v12.4s, v0.s[2]\n"
810 "add x16, x16, #4\n"
811 "mov v1.16b, v5.16b\n"
812 "add x17, x17, #4\n"
813 "fsub v8.4s, v8.4s, v2.4s\n"
814 "mov v2.16b, v5.16b\n"
815 "fmls v8.4s, v4.4s, v0.s[1]\n"
816 "fmls v11.4s, v4.4s, v0.s[3]\n"
817 "fmla v1.4s, v3.4s, v0.s[2]\n"
818 "fmls v2.4s, v22.4s, v0.s[2]\n"
819 "mov v3.16b, v5.16b\n"
820 "mov v4.16b, v5.16b\n"
821 "mov v5.16b, v5.16b\n"
822 "mov v6.16b, v6.16b\n"
823 "fmls v1.4s, v20.4s, v0.s[3]\n"
824 "fmls v2.4s, v20.4s, v0.s[2]\n"
825 "fmla v3.4s, v22.4s, v0.s[2]\n"
826 "fmls v4.4s, v22.4s, v0.s[1]\n"
827 "fmla v5.4s, v22.4s, v0.s[1]\n"
828 "fmla v6.4s, v22.4s, v0.s[2]\n"
829 "fadd v2.4s, v2.4s, v16.4s\n"
830 "mov v12.16b, v17.16b\n"
831 "fmls v3.4s, v20.4s, v0.s[2]\n"
832 "fsub v4.4s, v4.4s, v20.4s\n"
833 "fmla v4.4s, v16.4s, v0.s[1]\n"
834 "fsub v5.4s, v5.4s, v20.4s\n"
835 "fmls v5.4s, v16.4s, v0.s[1]\n"
836 "fmls v6.4s, v16.4s, v0.s[3]\n"
837 "fsub v3.4s, v3.4s, v16.4s\n"
838 "fmla v12.4s, v19.4s, v0.s[2]\n"
839 "mov v19.16b, v17.16b\n"
840 "mov v20.16b, v17.16b\n"
841 "mov v16.16b, v17.16b\n"
842 "mov v17.16b, v17.16b\n"
843 "fmls v12.4s, v15.4s, v0.s[3]\n"
844 "fmls v19.4s, v21.4s, v0.s[2]\n"
845 "fmla v20.4s, v21.4s, v0.s[2]\n"
846 "fmls v16.4s, v21.4s, v0.s[1]\n"
847 "fmla v17.4s, v21.4s, v0.s[1]\n"
848 "mov v18.16b, v18.16b\n"
849 "fmls v19.4s, v15.4s, v0.s[2]\n"
850 "mov v23.16b, v12.16b\n"
851 "fmls v20.4s, v15.4s, v0.s[2]\n"
852 "fsub v16.4s, v16.4s, v15.4s\n"
853 "fmla v16.4s, v13.4s, v0.s[1]\n"
854 "fsub v17.4s, v17.4s, v15.4s\n"
855 "fadd v19.4s, v19.4s, v13.4s\n"
856 "fmls v17.4s, v13.4s, v0.s[1]\n"
857 "fsub v20.4s, v20.4s, v13.4s\n"
858 "fmla v18.4s, v21.4s, v0.s[2]\n"
859 "fmla v23.4s, v14.4s, v0.s[2]\n"
860 "mov v15.16b, v19.16b\n"
861 "mov v14.16b, v20.16b\n"
862 "mov v24.16b, v16.16b\n"
863 "fmls v18.4s, v13.4s, v0.s[3]\n"
864 "fmla v15.4s, v10.4s, v0.s[2]\n"
865 "fmls v23.4s, v1.4s, v0.s[3]\n"
866 "fmla v14.4s, v9.4s, v0.s[2]\n"
867 "fmla v24.4s, v7.4s, v0.s[2]\n"
868 "mov v10.16b, v17.16b\n"
869 "fmls v15.4s, v2.4s, v0.s[3]\n"
870 "mov v7.16b, v18.16b\n"
871 "str s23, [%[outptr0]]\n"
872 "fmls v14.4s, v3.4s, v0.s[3]\n"
873 "fmls v24.4s, v4.4s, v0.s[3]\n"
874 "fmla v10.4s, v8.4s, v0.s[2]\n"
875 "str s15, [%[outptr0], %[output_col_stride1]]\n"
876 "fmla v7.4s, v11.4s, v0.s[2]\n"
877 "str s14, [%[outptr0], x11]\n"
878 "fmls v10.4s, v5.4s, v0.s[3]\n"
879 "str s24, [%[outptr0], x13]\n"
880 "fmls v7.4s, v6.4s, v0.s[3]\n"
881 "str s10, [%[outptr0], x23]\n"
882 "str s7, [%[outptr0], x15]\n"
883 "add %[outptr0], %[outptr0], #4\n"
884 "mov v26.16b, v12.16b\n"
885 "mov v25.16b, v19.16b\n"
886 "ldr s11, [x25, x20]\n"
887 "mov v10.16b, v11.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000888 "ldr s23, [x25, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000889 "mov v9.16b, v11.16b\n"
890 "ldr s7, [x25]\n"
891 "fmla v10.4s, v7.4s, v0.s[2]\n"
892 "ldr s13, [x25, x21]\n"
893 "mov v7.16b, v11.16b\n"
894 "ldr s31, [x25, x19]\n"
895 "mov v8.16b, v11.16b\n"
896 "ldr s21, [x25, %[input_col_stride1]]\n"
897 "fmls v10.4s, v23.4s, v0.s[3]\n"
898 "ldr s30, [x26, x20]\n"
899 "fmls v9.4s, v21.4s, v0.s[2]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000900 "ldr s29, [x26, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +0000901 "fmla v7.4s, v21.4s, v0.s[2]\n"
902 "ldr s22, [x26]\n"
903 "fmls v8.4s, v21.4s, v0.s[1]\n"
904 "ldr s24, [x26, x21]\n"
905 "fmls v9.4s, v23.4s, v0.s[2]\n"
906 "ldr s27, [x26, x19]\n"
907 "fmls v7.4s, v23.4s, v0.s[2]\n"
908 "ldr s28, [x26, %[input_col_stride1]]\n"
909 "fsub v8.4s, v8.4s, v23.4s\n"
910 "add x25, x25, #4\n"
911 "fadd v9.4s, v9.4s, v31.4s\n"
912 "add x26, x26, #4\n"
913 "fsub v7.4s, v7.4s, v31.4s\n"
914 "fmla v8.4s, v31.4s, v0.s[1]\n"
915 "mov v11.16b, v11.16b\n"
916 "mov v15.16b, v13.16b\n"
917 "mov v14.16b, v30.16b\n"
918 "mov v13.16b, v30.16b\n"
919 "fmla v11.4s, v21.4s, v0.s[1]\n"
920 "fmla v15.4s, v21.4s, v0.s[2]\n"
921 "fmla v14.4s, v22.4s, v0.s[2]\n"
922 "fmls v13.4s, v28.4s, v0.s[2]\n"
923 "mov v21.16b, v30.16b\n"
924 "mov v22.16b, v30.16b\n"
925 "fsub v11.4s, v11.4s, v23.4s\n"
926 "fmls v15.4s, v31.4s, v0.s[3]\n"
927 "fmls v11.4s, v31.4s, v0.s[1]\n"
928 "fmls v14.4s, v29.4s, v0.s[3]\n"
929 "fmls v13.4s, v29.4s, v0.s[2]\n"
930 "fmla v21.4s, v28.4s, v0.s[2]\n"
931 "fmls v22.4s, v28.4s, v0.s[1]\n"
932 "mov v23.16b, v30.16b\n"
933 "mov v24.16b, v24.16b\n"
934 "fmls v26.4s, v10.4s, v0.s[2]\n"
935 "fadd v13.4s, v13.4s, v27.4s\n"
936 "fmls v21.4s, v29.4s, v0.s[2]\n"
937 "fsub v22.4s, v22.4s, v29.4s\n"
938 "fmla v23.4s, v28.4s, v0.s[1]\n"
939 "fmla v22.4s, v27.4s, v0.s[1]\n"
940 "fmla v24.4s, v28.4s, v0.s[2]\n"
941 "fsub v21.4s, v21.4s, v27.4s\n"
942 "fmls v26.4s, v1.4s, v0.s[2]\n"
943 "fsub v23.4s, v23.4s, v29.4s\n"
944 "fmls v25.4s, v9.4s, v0.s[2]\n"
945 "fmls v23.4s, v27.4s, v0.s[1]\n"
946 "fmls v24.4s, v27.4s, v0.s[3]\n"
947 "fadd v26.4s, v26.4s, v14.4s\n"
948 "mov v27.16b, v20.16b\n"
949 "str s26, [x28]\n"
950 "fmls v25.4s, v2.4s, v0.s[2]\n"
951 "fmls v27.4s, v7.4s, v0.s[2]\n"
952 "mov v31.16b, v16.16b\n"
953 "mov v30.16b, v17.16b\n"
954 "mov v29.16b, v18.16b\n"
955 "fadd v25.4s, v25.4s, v13.4s\n"
956 "fmls v31.4s, v8.4s, v0.s[2]\n"
957 "str s25, [x28, %[output_col_stride1]]\n"
958 "fmls v27.4s, v3.4s, v0.s[2]\n"
959 "fmls v30.4s, v11.4s, v0.s[2]\n"
960 "fmls v29.4s, v15.4s, v0.s[2]\n"
961 "fmls v31.4s, v4.4s, v0.s[2]\n"
962 "mov v26.16b, v12.16b\n"
963 "fadd v27.4s, v27.4s, v21.4s\n"
964 "mov v25.16b, v19.16b\n"
965 "str s27, [x28, x11]\n"
966 "fmls v30.4s, v5.4s, v0.s[2]\n"
967 "fadd v31.4s, v31.4s, v22.4s\n"
968 "fmls v29.4s, v6.4s, v0.s[2]\n"
969 "str s31, [x28, x13]\n"
970 "fmla v26.4s, v10.4s, v0.s[2]\n"
971 "fadd v30.4s, v30.4s, v23.4s\n"
972 "fmla v25.4s, v9.4s, v0.s[2]\n"
973 "str s30, [x28, x23]\n"
974 "fadd v29.4s, v29.4s, v24.4s\n"
975 "str s29, [x28, x15]\n"
976 "fmls v26.4s, v1.4s, v0.s[2]\n"
977 "fmls v25.4s, v2.4s, v0.s[2]\n"
978 "add x28, x28, #4\n"
979 "mov v30.16b, v20.16b\n"
980 "mov v29.16b, v16.16b\n"
981 "fsub v26.4s, v26.4s, v14.4s\n"
982 "mov v28.16b, v17.16b\n"
983 "str s26, [x22]\n"
984 "fsub v25.4s, v25.4s, v13.4s\n"
985 "str s25, [x22, %[output_col_stride1]]\n"
986 "fmla v30.4s, v7.4s, v0.s[2]\n"
987 "fmla v29.4s, v8.4s, v0.s[2]\n"
988 "fmla v28.4s, v11.4s, v0.s[2]\n"
989 "mov v26.16b, v18.16b\n"
990 "mov v25.16b, v12.16b\n"
991 "fmls v30.4s, v3.4s, v0.s[2]\n"
992 "mov v31.16b, v19.16b\n"
993 "fmls v29.4s, v4.4s, v0.s[2]\n"
994 "fmls v28.4s, v5.4s, v0.s[2]\n"
995 "fmla v26.4s, v15.4s, v0.s[2]\n"
996 "fmls v25.4s, v10.4s, v0.s[1]\n"
997 "fsub v30.4s, v30.4s, v21.4s\n"
998 "fmls v31.4s, v9.4s, v0.s[1]\n"
999 "str s30, [x22, x11]\n"
1000 "fsub v29.4s, v29.4s, v22.4s\n"
1001 "str s29, [x22, x13]\n"
1002 "fsub v28.4s, v28.4s, v23.4s\n"
1003 "str s28, [x22, x23]\n"
1004 "fmls v26.4s, v6.4s, v0.s[2]\n"
1005 "fsub v25.4s, v25.4s, v1.4s\n"
1006 "fsub v31.4s, v31.4s, v2.4s\n"
1007 "fmla v25.4s, v14.4s, v0.s[1]\n"
1008 "fmla v31.4s, v13.4s, v0.s[1]\n"
1009 "fsub v26.4s, v26.4s, v24.4s\n"
1010 "mov v27.16b, v20.16b\n"
1011 "str s26, [x22, x15]\n"
1012 "mov v26.16b, v16.16b\n"
1013 "str s25, [x12]\n"
1014 "fmls v27.4s, v7.4s, v0.s[1]\n"
1015 "str s31, [x12, %[output_col_stride1]]\n"
1016 "fmls v26.4s, v8.4s, v0.s[1]\n"
1017 "mov v25.16b, v17.16b\n"
1018 "add x22, x22, #4\n"
1019 "fsub v27.4s, v27.4s, v3.4s\n"
1020 "mov v28.16b, v18.16b\n"
1021 "fmla v27.4s, v21.4s, v0.s[1]\n"
1022 "fsub v26.4s, v26.4s, v4.4s\n"
1023 "fmla v26.4s, v22.4s, v0.s[1]\n"
1024 "fmls v25.4s, v11.4s, v0.s[1]\n"
1025 "fmls v28.4s, v15.4s, v0.s[1]\n"
1026 "mov v12.16b, v12.16b\n"
1027 "str s27, [x12, x11]\n"
1028 "mov v19.16b, v19.16b\n"
1029 "str s26, [x12, x13]\n"
1030 "fsub v25.4s, v25.4s, v5.4s\n"
1031 "fmla v25.4s, v23.4s, v0.s[1]\n"
1032 "fsub v28.4s, v28.4s, v6.4s\n"
1033 "fmla v28.4s, v24.4s, v0.s[1]\n"
1034 "fmla v12.4s, v10.4s, v0.s[1]\n"
1035 "fmla v19.4s, v9.4s, v0.s[1]\n"
1036 "mov v20.16b, v20.16b\n"
1037 "str s25, [x12, x23]\n"
1038 "mov v16.16b, v16.16b\n"
1039 "str s28, [x12, x15]\n"
1040 "fsub v12.4s, v12.4s, v1.4s\n"
1041 "fmls v12.4s, v14.4s, v0.s[1]\n"
1042 "add x12, x12, #4\n"
1043 "fsub v19.4s, v19.4s, v2.4s\n"
1044 "fmla v20.4s, v7.4s, v0.s[1]\n"
1045 "fmls v19.4s, v13.4s, v0.s[1]\n"
1046 "fmla v16.4s, v8.4s, v0.s[1]\n"
1047 "str s12, [x14]\n"
1048 "mov v1.16b, v17.16b\n"
1049 "fsub v20.4s, v20.4s, v3.4s\n"
1050 "mov v17.16b, v18.16b\n"
1051 "str s19, [x14, %[output_col_stride1]]\n"
1052 "fmls v20.4s, v21.4s, v0.s[1]\n"
1053 "fsub v16.4s, v16.4s, v4.4s\n"
1054 "fmla v1.4s, v11.4s, v0.s[1]\n"
1055 "fmls v16.4s, v22.4s, v0.s[1]\n"
1056 "fmla v17.4s, v15.4s, v0.s[1]\n"
1057 "str s20, [x14, x11]\n"
1058 "fsub v1.4s, v1.4s, v5.4s\n"
1059 "str s16, [x14, x13]\n"
1060 "fmls v1.4s, v23.4s, v0.s[1]\n"
1061 "fsub v17.4s, v17.4s, v6.4s\n"
1062 "fmls v17.4s, v24.4s, v0.s[1]\n"
1063 "str s1, [x14, x23]\n"
1064 "str s17, [x14, x15]\n"
1065 "add x14, x14, #4\n"
1066 "ldr s2, [x27, x20]\n"
1067 "mov v4.16b, v2.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +00001068 "ldr s17, [x27, x9]\n"
Pablo Tello8f43d742019-03-27 09:28:32 +00001069 "mov v12.16b, v2.16b\n"
1070 "ldr s18, [x27]\n"
1071 "fmla v4.4s, v18.4s, v0.s[2]\n"
1072 "ldr s3, [x27, x21]\n"
1073 "mov v6.16b, v2.16b\n"
1074 "ldr s5, [x27, x19]\n"
1075 "mov v1.16b, v2.16b\n"
1076 "ldr s18, [x27, %[input_col_stride1]]\n"
1077 "fmls v4.4s, v17.4s, v0.s[3]\n"
1078 "add x27, x27, #4\n"
1079 "fmls v12.4s, v18.4s, v0.s[2]\n"
1080 "fmla v6.4s, v18.4s, v0.s[2]\n"
1081 "fmls v1.4s, v18.4s, v0.s[1]\n"
1082 "mov v2.16b, v2.16b\n"
1083 "mov v3.16b, v3.16b\n"
1084 "mov v4.16b, v4.16b\n"
1085 "fmls v12.4s, v17.4s, v0.s[2]\n"
1086 "fmls v6.4s, v17.4s, v0.s[2]\n"
1087 "fsub v1.4s, v1.4s, v17.4s\n"
1088 "fmla v2.4s, v18.4s, v0.s[1]\n"
1089 "fmla v1.4s, v5.4s, v0.s[1]\n"
1090 "fmla v3.4s, v18.4s, v0.s[2]\n"
1091 "fadd v12.4s, v12.4s, v5.4s\n"
1092 "fsub v6.4s, v6.4s, v5.4s\n"
1093 "fsub v2.4s, v2.4s, v17.4s\n"
1094 "fmla v4.4s, v10.4s, v0.s[2]\n"
1095 "fmls v2.4s, v5.4s, v0.s[1]\n"
1096 "fmls v3.4s, v5.4s, v0.s[3]\n"
1097 "mov v16.16b, v12.16b\n"
1098 "mov v5.16b, v6.16b\n"
1099 "fmls v4.4s, v14.4s, v0.s[3]\n"
1100 "mov v6.16b, v1.16b\n"
1101 "fmla v16.4s, v9.4s, v0.s[2]\n"
1102 "fmla v5.4s, v7.4s, v0.s[2]\n"
1103 "fmla v6.4s, v8.4s, v0.s[2]\n"
1104 "mov v9.16b, v2.16b\n"
1105 "str s4, [x24]\n"
1106 "mov v10.16b, v3.16b\n"
1107 "fmls v16.4s, v13.4s, v0.s[3]\n"
1108 "fmls v5.4s, v21.4s, v0.s[3]\n"
1109 "fmls v6.4s, v22.4s, v0.s[3]\n"
1110 "fmla v9.4s, v11.4s, v0.s[2]\n"
1111 "fmla v10.4s, v15.4s, v0.s[2]\n"
1112 "str s16, [x24, %[output_col_stride1]]\n"
1113 "str s5, [x24, x11]\n"
1114 "fmls v9.4s, v23.4s, v0.s[3]\n"
1115 "str s6, [x24, x13]\n"
1116 "fmls v10.4s, v24.4s, v0.s[3]\n"
1117 "str s9, [x24, x23]\n"
1118 "str s10, [x24, x15]\n"
1119 "add x24, x24, #4\n"
1120 "4:\n"
1121 : [outptr0] "+r" (matrix_base),
1122 [n_channels] "+r" (n_channels),
1123 [inptr0] "+r" (input_base)
1124 : [pcoeffs] "r" (pcoeffs),
1125 [output_row_stride] "r" (6 * matrix_stride * sizeof(float)),
1126 [output_col_stride1] "r" (matrix_stride * sizeof(float)),
1127 [input_row_stride] "r" (input_row_stride * sizeof(float)),
1128 [input_col_stride1] "r" (input_col_stride * sizeof(float))
1129 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
1130 "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
1131 "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8",
Georgios Pinitasce3a7b22020-03-10 15:33:57 +00001132 "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x9", "x19",
Pablo Tello8f43d742019-03-27 09:28:32 +00001133 "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
1134 );
1135}
1136
1137#else // __arm__ not __aarch64__
1138
1139template <>
1140void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
1141 const int n_channels,
1142 const float* const input_base,
1143 const int input_row_stride,
1144 const int input_col_stride,
1145 float* outptr,
1146 const int matrix_stride
1147)
1148{
1149 constexpr int inner_tile_rows = 6;
1150 constexpr int inner_tile_cols = 6;
1151
1152 // Get pointers into the input tile
1153 const float *x_ptrs[inner_tile_rows][inner_tile_cols];
1154 for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
1155 {
1156 // Get a pointer into the row
1157 const float* const row_ptr = input_base + xi*input_row_stride;
1158
1159 for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
1160 {
1161 x_ptrs[i][j] = row_ptr + xj*input_col_stride;
1162 }
1163 }
1164
1165 // Matrices used/computed in this kernel.
1166 float x[inner_tile_rows][inner_tile_cols];
1167 float XTx[inner_tile_rows][inner_tile_cols];
1168 float U[inner_tile_rows][inner_tile_cols];
1169 for (int i = 0; i < inner_tile_rows; i++)
1170 {
1171 for (int j = 0; j < inner_tile_cols; j++)
1172 {
1173 x[i][j] = XTx[i][j] = 0.0f;
1174 }
1175 }
1176
1177 // Perform the Winograd input transformation for each channel in the input
1178 // tensor.
1179 int channels_remaining = n_channels;
1180 for (; channels_remaining >= 2; channels_remaining -= 2)
1181 {
1182 // Matrices used/computed in this kernel
1183 float32x2_t x[inner_tile_rows][inner_tile_cols];
1184 float32x2_t XTx[inner_tile_rows][inner_tile_cols];
1185 float32x2_t U[inner_tile_rows][inner_tile_cols];
1186 for (int i = 0; i < inner_tile_rows; i++)
1187 {
1188 for (int j = 0; j < inner_tile_cols; j++)
1189 {
1190 x[i][j] = vdup_n_f32(0.0f);
1191 XTx[i][j] = vdup_n_f32(0.0f);
1192 }
1193 }
1194
1195 // Read a 6x6 tile in the Winograd domain
1196 for (int i = 0; i < inner_tile_rows; i++)
1197 {
1198 for (int j = 0; j < inner_tile_cols; j++)
1199 {
1200 x[i][j] = vld1_f32(x_ptrs[i][j]);
1201 x_ptrs[i][j] += 2;
1202 }
1203 }
1204
1205 // Compute XT . x
1206 for (int j = 0; j < inner_tile_cols; j++)
1207 {
1208 // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
1209 XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
1210
1211 // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
1212 XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
1213
1214 // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
1215 XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
1216
1217 // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
1218 XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
1219
1220 // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
1221 XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
1222
1223 // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
1224 XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
1225 }
1226
1227 // Compute U = XT . x . X
1228 for (int i = 0; i < inner_tile_rows; i++)
1229 {
1230 // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
1231 U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
1232
1233 // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
1234 U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
1235
1236 // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
1237 U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
1238
1239 // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
1240 U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
1241
1242 // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
1243 U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
1244
1245 // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
1246 U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
1247 }
1248
1249 // Store the transformed matrix
1250 for (int i = 0, m = 0; i < inner_tile_rows; i++)
1251 {
1252 for (int j = 0; j < inner_tile_cols; j++, m++)
1253 {
1254 vst1_f32(outptr + m*matrix_stride, U[i][j]);
1255 }
1256 }
1257 outptr += 2;
1258 }
1259 for (; channels_remaining; channels_remaining--)
1260 {
1261 // Load x
1262 for (int i = 0; i < inner_tile_rows; i++)
1263 {
1264 for (int j = 0; j < inner_tile_cols; j++)
1265 {
1266 x[i][j] = *(x_ptrs[i][j]++);
1267 }
1268 }
1269
1270 // Compute XT . x
1271 for (int j = 0; j < inner_tile_cols; j++)
1272 {
1273 XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
1274 XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
1275 XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
1276 XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
1277 XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
1278 XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
1279 }
1280
1281 // Compute U = XT . x . X
1282 for (int i = 0; i < inner_tile_rows; i++)
1283 {
1284 U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
1285 U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
1286 U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
1287 U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
1288 U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
1289 U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
1290 }
1291
1292 // Store the transformed matrix
1293 for (int i = 0, m = 0; i < inner_tile_rows; i++)
1294 {
1295 for (int j = 0; j < inner_tile_cols; j++, m++)
1296 {
1297 *(outptr + m*matrix_stride) = U[i][j];
1298 }
1299 }
1300 outptr++;
1301 }
1302}
1303
1304#endif
1305
1306template class InputTransform<6, 6, float, float, WinogradRoots::Integers>;
1307
1308} // namespace winograd