blob: 010dd81bce44eea6ebe6bedf3d3eec4eaf8d4966 [file] [log] [blame]
/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Georgios Pinitas20c246a2018-09-12 16:45:53 +010024#include "impl_fp32_fp32.hpp"
Georgios Pinitas4074c992018-01-30 18:13:46 +000025
26namespace depthwise
27{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000028
// Bring the names from neon_convolution_kernels into scope for the definitions below.
29using namespace neon_convolution_kernels;
// Alias for the DepthwiseConvolution instantiation that this file specialises.
// NOTE(review): the integer arguments presumably mean 2x2 output tile, 3x3
// kernel and 2x2 stride (that is what the assembly below computes) - confirm
// against the DepthwiseConvolution declaration.
30using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
31
32#ifdef __aarch64__
/**
 * Specialisation of Conv::execute_tile for ActivationFunction::None that reads
 * the input through a base pointer plus row/column strides (AArch64 inline
 * assembly).
 *
 * Each pass produces a 2x2 block of outputs from a 5x5 block of inputs: the
 * accumulators for outputs (0,0), (0,1), (1,0) and (1,1) read 3x3 windows that
 * start at inputs (0,0), (0,2), (2,0) and (2,2) respectively. The accumulated
 * values are stored as they are; no clamping is applied before the stores.
 *
 * Channels are handled four at a time with q-registers (n_channels >> 2
 * passes) and then one at a time with s-registers (n_channels & 3 passes).
 * The schedule is software pipelined: a prologue starts a pass, the loop body
 * finishes one pass while starting the next, and a drain block finishes the
 * last pass.
 *
 * @param n_channels        Number of channels to process.
 * @param weight_bias_ptr   Packed parameters. Every pass reads ten consecutive
 *                          vectors (160 bytes per four-channel pass, 40 bytes
 *                          per single-channel pass): the first one seeds all
 *                          four accumulators and the following nine are the
 *                          3x3 weights in row-major order.
 * @param input             Pointer to input element (0,0); consecutive
 *                          channels are read from consecutive floats.
 * @param input_row_stride  Distance between input rows, in floats (scaled by
 *                          sizeof(float) before it reaches the assembly).
 * @param input_col_stride  Distance between input columns, in floats.
 * @param output            Pointer to output element (0,0); consecutive
 *                          channels are written to consecutive floats.
 * @param output_row_stride Distance between output rows, in floats.
 * @param output_col_stride Distance between output columns, in floats.
 */
33template <>
34template <>
35void Conv::execute_tile<ActivationFunction::None>(
36 int n_channels,
37 const void *weight_bias_ptr,
38 const float *input,
39 const unsigned int input_row_stride,
40 const unsigned int input_col_stride,
41 float *output,
42 const unsigned int output_row_stride,
43 const unsigned int output_col_stride
44)
45{
46 __asm __volatile(
  // Setup. x23..x26 point at input rows 1..4 (row 0 is inptr0); x19, x20 and
  // x21 hold 2x, 3x and 4x the input column stride in bytes; x22 points at
  // output row 1. x28 = n_channels >> 2 (four-channel passes) and
  // x27 = n_channels & 3 (single-channel passes).
47 "add x23, %[inptr0], %[input_row_stride]\n"
48 "add x19, %[input_col_stride1], %[input_col_stride1]\n"
49 "add x22, %[outptr0], %[output_row_stride]\n"
50 "add x24, x23, %[input_row_stride]\n"
51 "add x20, x19, %[input_col_stride1]\n"
52 "and x27, %[n_channels], #3\n"
53 "add x25, x24, %[input_row_stride]\n"
54 "add x21, x20, %[input_col_stride1]\n"
55 "lsr x28, %[n_channels], #2\n"
56 "add x26, x25, %[input_row_stride]\n"
  // No complete group of four channels: go straight to the scalar section.
57 "cbz x28, 4f\n"
  // Vector prologue. v14 (offset 0) is copied into the accumulators v12, v10,
  // v11 and v9; v8, v7, v6 / v5, v4, v3 / v2, v1, v0 hold the three weight
  // rows. The first inputs are loaded and output (0,0) (v12) is started.
58 "1:\n"
59 "ldr q14, [%[wbptr]]\n"
60 "subs x28, x28, #1\n"
61 "mov v12.16b, v14.16b\n"
62 "ldr q8, [%[wbptr], #16]\n"
63 "mov v10.16b, v14.16b\n"
64 "ldr q7, [%[wbptr], #32]\n"
65 "mov v11.16b, v14.16b\n"
66 "ldr q6, [%[wbptr], #48]\n"
67 "mov v9.16b, v14.16b\n"
68 "ldr q5, [%[wbptr], #64]\n"
69 "ldr q4, [%[wbptr], #80]\n"
70 "ldr q3, [%[wbptr], #96]\n"
71 "ldr q2, [%[wbptr], #112]\n"
72 "ldr q1, [%[wbptr], #128]\n"
73 "ldr q0, [%[wbptr], #144]\n"
74 "ldr q15, [%[inptr0]]\n"
75 "fmla v12.4s, v15.4s, v8.4s\n"
76 "ldr q20, [x23]\n"
77 "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
78 "ldr q17, [x24]\n"
79 "fmla v10.4s, v17.4s, v8.4s\n"
80 "ldr q16, [x23, %[input_col_stride1]]\n"
81 "fmla v12.4s, v20.4s, v5.4s\n"
82 "ldr q18, [%[inptr0], x19]\n"
83 "ldr q14, [x25]\n"
84 "ldr q15, [x24, %[input_col_stride1]]\n"
85 "fmla v12.4s, v13.4s, v7.4s\n"
86 "fmla v12.4s, v17.4s, v2.4s\n"
87 "fmla v12.4s, v16.4s, v4.4s\n"
88 "fmla v12.4s, v18.4s, v6.4s\n"
  // The flags still come from the `subs` above (nothing in between sets
  // them): if that was the last four-channel pass, skip the loop body.
89 "beq 3f\n"
  // Vector loop body. Finishes the four outputs of the current pass and
  // stores them (v12 -> [outptr0], v11 -> [outptr0 + output_col_stride],
  // v10 -> [x22], v9 -> [x22 + output_col_stride]), while advancing wbptr by
  // 160 bytes and all data pointers by 16 bytes and repeating the prologue
  // work for the next pass.
90 "2:\n"
91 "fmla v11.4s, v18.4s, v8.4s\n"
92 "ldr q19, [x23, x19]\n"
93 "fmla v10.4s, v14.4s, v5.4s\n"
94 "ldr q20, [%[inptr0], x20]\n"
95 "fmla v12.4s, v15.4s, v1.4s\n"
96 "ldr q14, [x26]\n"
97 "fmla v11.4s, v19.4s, v5.4s\n"
98 "ldr q13, [x25, %[input_col_stride1]]\n"
99 "fmla v10.4s, v15.4s, v7.4s\n"
100 "ldr q17, [x24, x19]\n"
101 "fmla v12.4s, v19.4s, v3.4s\n"
102 "ldr q19, [x23, x20]\n"
103 "fmla v11.4s, v20.4s, v7.4s\n"
104 "ldr q18, [%[inptr0], x21]\n"
105 "fmla v10.4s, v14.4s, v2.4s\n"
106 "ldr q16, [x26, %[input_col_stride1]]\n"
107 "fmla v12.4s, v17.4s, v0.4s\n"
108 "ldr q14, [x25, x19]\n"
109 "fmla v11.4s, v17.4s, v2.4s\n"
110 "ldr q15, [x24, x20]\n"
111 "fmla v10.4s, v13.4s, v4.4s\n"
112 "ldr q13, [x23, x21]\n"
113 "str q12, [%[outptr0]]\n"
114 "fmla v9.4s, v17.4s, v8.4s\n"
115 "fmla v11.4s, v19.4s, v4.4s\n"
116 "ldr q12, [x26, x19]\n"
117 "fmla v10.4s, v17.4s, v6.4s\n"
118 "ldr q20, [x25, x20]\n"
119 "fmla v9.4s, v14.4s, v5.4s\n"
120 "ldr q17, [x24, x21]\n"
121 "fmla v11.4s, v18.4s, v6.4s\n"
122 "ldr q19, [x26, x20]\n"
123 "fmla v10.4s, v16.4s, v1.4s\n"
124 "ldr q18, [x25, x21]\n"
125 "fmla v9.4s, v15.4s, v7.4s\n"
126 "ldr q16, [x26, x21]\n"
127 "fmla v11.4s, v15.4s, v1.4s\n"
128 "add %[wbptr], %[wbptr], #160\n"
129 "fmla v10.4s, v14.4s, v3.4s\n"
130 "ldr q14, [%[wbptr]]\n"
131 "fmla v9.4s, v12.4s, v2.4s\n"
132 "ldr q8, [%[wbptr], #16]\n"
133 "fmla v11.4s, v13.4s, v3.4s\n"
134 "ldr q7, [%[wbptr], #32]\n"
135 "fmla v10.4s, v12.4s, v0.4s\n"
136 "ldr q5, [%[wbptr], #64]\n"
137 "fmla v9.4s, v20.4s, v4.4s\n"
138 "ldr q2, [%[wbptr], #112]\n"
139 "fmla v11.4s, v17.4s, v0.4s\n"
140 "prfm pldl1keep, [%[wbptr], #64]\n"
141 "str q10, [x22]\n"
142 "mov v12.16b, v14.16b\n"
143 "fmla v9.4s, v17.4s, v6.4s\n"
144 "ldr q4, [%[wbptr], #80]\n"
145 "str q11, [%[outptr0], %[output_col_stride1]]\n"
146 "mov v10.16b, v14.16b\n"
147 "mov v11.16b, v14.16b\n"
148 "add %[inptr0], %[inptr0], #16\n"
149 "fmla v9.4s, v19.4s, v1.4s\n"
150 "ldr q6, [%[wbptr], #48]\n"
151 "ldr q15, [%[inptr0]]\n"
152 "add x23, x23, #16\n"
153 "fmla v12.4s, v15.4s, v8.4s\n"
154 "ldr q20, [x23]\n"
155 "fmla v9.4s, v18.4s, v3.4s\n"
156 "ldr q1, [%[wbptr], #128]\n"
157 "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
158 "add x24, x24, #16\n"
159 "fmla v12.4s, v20.4s, v5.4s\n"
160 "ldr q17, [x24]\n"
161 "fmla v9.4s, v16.4s, v0.4s\n"
162 "ldr q3, [%[wbptr], #96]\n"
163 "fmla v10.4s, v17.4s, v8.4s\n"
164 "ldr q16, [x23, %[input_col_stride1]]\n"
165 "fmla v12.4s, v13.4s, v7.4s\n"
166 "ldr q18, [%[inptr0], x19]\n"
167 "str q9, [x22, %[output_col_stride1]]\n"
168 "add x25, x25, #16\n"
169 "mov v9.16b, v14.16b\n"
170 "ldr q0, [%[wbptr], #144]\n"
171 "fmla v12.4s, v17.4s, v2.4s\n"
172 "ldr q14, [x25]\n"
173 "ldr q15, [x24, %[input_col_stride1]]\n"
174 "add x26, x26, #16\n"
175 "add %[outptr0], %[outptr0], #16\n"
176 "add x22, x22, #16\n"
177 "subs x28, x28, #1\n"
178 "fmla v12.4s, v16.4s, v4.4s\n"
179 "fmla v12.4s, v18.4s, v6.4s\n"
180 "bne 2b\n"
  // Vector drain. Finishes and stores the last four-channel pass; the
  // pointers are still advanced so the scalar section continues from the
  // next channel.
181 "3:\n"
182 "fmla v11.4s, v18.4s, v8.4s\n"
183 "ldr q19, [x23, x19]\n"
184 "fmla v10.4s, v14.4s, v5.4s\n"
185 "ldr q20, [%[inptr0], x20]\n"
186 "fmla v12.4s, v15.4s, v1.4s\n"
187 "ldr q14, [x26]\n"
188 "fmla v11.4s, v19.4s, v5.4s\n"
189 "ldr q13, [x25, %[input_col_stride1]]\n"
190 "fmla v10.4s, v15.4s, v7.4s\n"
191 "ldr q17, [x24, x19]\n"
192 "fmla v12.4s, v19.4s, v3.4s\n"
193 "ldr q19, [x23, x20]\n"
194 "fmla v11.4s, v20.4s, v7.4s\n"
195 "ldr q18, [%[inptr0], x21]\n"
196 "fmla v10.4s, v14.4s, v2.4s\n"
197 "ldr q16, [x26, %[input_col_stride1]]\n"
198 "fmla v12.4s, v17.4s, v0.4s\n"
199 "ldr q14, [x25, x19]\n"
200 "fmla v11.4s, v17.4s, v2.4s\n"
201 "ldr q15, [x24, x20]\n"
202 "fmla v10.4s, v13.4s, v4.4s\n"
203 "ldr q13, [x23, x21]\n"
204 "str q12, [%[outptr0]]\n"
205 "fmla v9.4s, v17.4s, v8.4s\n"
206 "fmla v11.4s, v19.4s, v4.4s\n"
207 "ldr q12, [x26, x19]\n"
208 "fmla v10.4s, v17.4s, v6.4s\n"
209 "ldr q20, [x25, x20]\n"
210 "fmla v9.4s, v14.4s, v5.4s\n"
211 "ldr q17, [x24, x21]\n"
212 "fmla v11.4s, v18.4s, v6.4s\n"
213 "ldr q19, [x26, x20]\n"
214 "fmla v10.4s, v16.4s, v1.4s\n"
215 "ldr q18, [x25, x21]\n"
216 "fmla v9.4s, v15.4s, v7.4s\n"
217 "ldr q16, [x26, x21]\n"
218 "fmla v11.4s, v15.4s, v1.4s\n"
219 "add %[wbptr], %[wbptr], #160\n"
220 "fmla v10.4s, v14.4s, v3.4s\n"
221 "prfm pldl1keep, [%[wbptr], #64]\n"
222 "fmla v9.4s, v12.4s, v2.4s\n"
223 "add %[inptr0], %[inptr0], #16\n"
224 "fmla v11.4s, v13.4s, v3.4s\n"
225 "add x23, x23, #16\n"
226 "fmla v10.4s, v12.4s, v0.4s\n"
227 "add x24, x24, #16\n"
228 "fmla v9.4s, v20.4s, v4.4s\n"
229 "add x25, x25, #16\n"
230 "fmla v11.4s, v17.4s, v0.4s\n"
231 "add x26, x26, #16\n"
232 "str q10, [x22]\n"
233 "fmla v9.4s, v17.4s, v6.4s\n"
234 "str q11, [%[outptr0], %[output_col_stride1]]\n"
235 "add %[outptr0], %[outptr0], #16\n"
236 "fmla v9.4s, v19.4s, v1.4s\n"
237 "fmla v9.4s, v18.4s, v3.4s\n"
238 "fmla v9.4s, v16.4s, v0.4s\n"
239 "str q9, [x22, %[output_col_stride1]]\n"
240 "add x22, x22, #16\n"
  // Scalar section: the same prologue / loop / drain schedule using
  // s-register loads and stores, one channel per pass. Parameters are read at
  // 4-byte spacing (40 bytes per channel) and data pointers advance by 4.
241 "4:\n"
242 "cbz x27, 7f\n"
243 "ldr s14, [%[wbptr]]\n"
244 "mov v12.16b, v14.16b\n"
245 "ldr s8, [%[wbptr], #4]\n"
246 "mov v10.16b, v14.16b\n"
247 "ldr s7, [%[wbptr], #8]\n"
248 "mov v11.16b, v14.16b\n"
249 "ldr s6, [%[wbptr], #12]\n"
250 "mov v9.16b, v14.16b\n"
251 "ldr s5, [%[wbptr], #16]\n"
252 "ldr s4, [%[wbptr], #20]\n"
253 "subs x27, x27, #1\n"
254 "ldr s3, [%[wbptr], #24]\n"
255 "ldr s2, [%[wbptr], #28]\n"
256 "ldr s1, [%[wbptr], #32]\n"
257 "ldr s0, [%[wbptr], #36]\n"
258 "ldr s15, [%[inptr0]]\n"
259 "ldr s20, [x23]\n"
260 "fmla v12.4s, v15.4s, v8.4s\n"
261 "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
262 "ldr s17, [x24]\n"
263 "ldr s16, [x23, %[input_col_stride1]]\n"
264 "fmla v10.4s, v17.4s, v8.4s\n"
265 "ldr s18, [%[inptr0], x19]\n"
266 "fmla v12.4s, v20.4s, v5.4s\n"
267 "ldr s14, [x25]\n"
268 "ldr s15, [x24, %[input_col_stride1]]\n"
269 "fmla v12.4s, v13.4s, v7.4s\n"
270 "fmla v12.4s, v17.4s, v2.4s\n"
271 "fmla v12.4s, v16.4s, v4.4s\n"
272 "fmla v12.4s, v18.4s, v6.4s\n"
  // Flags from the `subs x27` above: last remaining channel goes to the
  // scalar drain.
273 "beq 6f\n"
  // Scalar loop body.
274 "5:\n"
275 "fmla v11.4s, v18.4s, v8.4s\n"
276 "ldr s19, [x23, x19]\n"
277 "fmla v10.4s, v14.4s, v5.4s\n"
278 "ldr s20, [%[inptr0], x20]\n"
279 "fmla v12.4s, v15.4s, v1.4s\n"
280 "ldr s14, [x26]\n"
281 "fmla v11.4s, v19.4s, v5.4s\n"
282 "ldr s13, [x25, %[input_col_stride1]]\n"
283 "fmla v10.4s, v15.4s, v7.4s\n"
284 "ldr s17, [x24, x19]\n"
285 "fmla v12.4s, v19.4s, v3.4s\n"
286 "ldr s19, [x23, x20]\n"
287 "fmla v11.4s, v20.4s, v7.4s\n"
288 "ldr s18, [%[inptr0], x21]\n"
289 "fmla v10.4s, v14.4s, v2.4s\n"
290 "ldr s16, [x26, %[input_col_stride1]]\n"
291 "fmla v12.4s, v17.4s, v0.4s\n"
292 "ldr s14, [x25, x19]\n"
293 "fmla v11.4s, v17.4s, v2.4s\n"
294 "ldr s15, [x24, x20]\n"
295 "fmla v10.4s, v13.4s, v4.4s\n"
296 "ldr s13, [x23, x21]\n"
297 "str s12, [%[outptr0]]\n"
298 "fmla v9.4s, v17.4s, v8.4s\n"
299 "fmla v11.4s, v19.4s, v4.4s\n"
300 "ldr s12, [x26, x19]\n"
301 "fmla v10.4s, v17.4s, v6.4s\n"
302 "ldr s20, [x25, x20]\n"
303 "fmla v9.4s, v14.4s, v5.4s\n"
304 "ldr s17, [x24, x21]\n"
305 "fmla v11.4s, v18.4s, v6.4s\n"
306 "ldr s19, [x26, x20]\n"
307 "fmla v10.4s, v16.4s, v1.4s\n"
308 "ldr s18, [x25, x21]\n"
309 "fmla v9.4s, v15.4s, v7.4s\n"
310 "ldr s16, [x26, x21]\n"
311 "fmla v11.4s, v15.4s, v1.4s\n"
312 "add %[wbptr], %[wbptr], #40\n"
313 "fmla v10.4s, v14.4s, v3.4s\n"
314 "ldr s14, [%[wbptr]]\n"
315 "fmla v9.4s, v12.4s, v2.4s\n"
316 "ldr s8, [%[wbptr], #4]\n"
317 "fmla v11.4s, v13.4s, v3.4s\n"
318 "ldr s7, [%[wbptr], #8]\n"
319 "fmla v10.4s, v12.4s, v0.4s\n"
320 "ldr s5, [%[wbptr], #16]\n"
321 "fmla v9.4s, v20.4s, v4.4s\n"
322 "ldr s2, [%[wbptr], #28]\n"
323 "fmla v11.4s, v17.4s, v0.4s\n"
324 "prfm pldl1keep, [%[wbptr], #64]\n"
325 "str s10, [x22]\n"
326 "mov v12.16b, v14.16b\n"
327 "fmla v9.4s, v17.4s, v6.4s\n"
328 "ldr s4, [%[wbptr], #20]\n"
329 "str s11, [%[outptr0], %[output_col_stride1]]\n"
330 "mov v10.16b, v14.16b\n"
331 "mov v11.16b, v14.16b\n"
332 "add %[inptr0], %[inptr0], #4\n"
333 "fmla v9.4s, v19.4s, v1.4s\n"
334 "ldr s6, [%[wbptr], #12]\n"
335 "ldr s15, [%[inptr0]]\n"
336 "add x23, x23, #4\n"
337 "fmla v12.4s, v15.4s, v8.4s\n"
338 "ldr s20, [x23]\n"
339 "fmla v9.4s, v18.4s, v3.4s\n"
340 "ldr s1, [%[wbptr], #32]\n"
341 "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
342 "add x24, x24, #4\n"
343 "fmla v12.4s, v20.4s, v5.4s\n"
344 "ldr s17, [x24]\n"
345 "fmla v9.4s, v16.4s, v0.4s\n"
346 "ldr s3, [%[wbptr], #24]\n"
347 "fmla v10.4s, v17.4s, v8.4s\n"
348 "ldr s16, [x23, %[input_col_stride1]]\n"
349 "fmla v12.4s, v13.4s, v7.4s\n"
350 "ldr s18, [%[inptr0], x19]\n"
351 "str s9, [x22, %[output_col_stride1]]\n"
352 "add x25, x25, #4\n"
353 "mov v9.16b, v14.16b\n"
354 "ldr s0, [%[wbptr], #36]\n"
355 "fmla v12.4s, v17.4s, v2.4s\n"
356 "ldr s14, [x25]\n"
357 "ldr s15, [x24, %[input_col_stride1]]\n"
358 "add x26, x26, #4\n"
359 "add %[outptr0], %[outptr0], #4\n"
360 "add x22, x22, #4\n"
361 "subs x27, x27, #1\n"
362 "fmla v12.4s, v16.4s, v4.4s\n"
363 "fmla v12.4s, v18.4s, v6.4s\n"
364 "bne 5b\n"
  // Scalar drain: finish and store the final channel.
365 "6:\n"
366 "fmla v11.4s, v18.4s, v8.4s\n"
367 "ldr s19, [x23, x19]\n"
368 "fmla v10.4s, v14.4s, v5.4s\n"
369 "ldr s20, [%[inptr0], x20]\n"
370 "fmla v12.4s, v15.4s, v1.4s\n"
371 "ldr s14, [x26]\n"
372 "fmla v11.4s, v19.4s, v5.4s\n"
373 "ldr s13, [x25, %[input_col_stride1]]\n"
374 "fmla v10.4s, v15.4s, v7.4s\n"
375 "ldr s17, [x24, x19]\n"
376 "fmla v12.4s, v19.4s, v3.4s\n"
377 "ldr s19, [x23, x20]\n"
378 "fmla v11.4s, v20.4s, v7.4s\n"
379 "ldr s18, [%[inptr0], x21]\n"
380 "fmla v10.4s, v14.4s, v2.4s\n"
381 "ldr s16, [x26, %[input_col_stride1]]\n"
382 "fmla v12.4s, v17.4s, v0.4s\n"
383 "ldr s14, [x25, x19]\n"
384 "fmla v11.4s, v17.4s, v2.4s\n"
385 "ldr s15, [x24, x20]\n"
386 "fmla v10.4s, v13.4s, v4.4s\n"
387 "ldr s13, [x23, x21]\n"
388 "str s12, [%[outptr0]]\n"
389 "fmla v9.4s, v17.4s, v8.4s\n"
390 "fmla v11.4s, v19.4s, v4.4s\n"
391 "ldr s12, [x26, x19]\n"
392 "fmla v10.4s, v17.4s, v6.4s\n"
393 "ldr s20, [x25, x20]\n"
394 "fmla v9.4s, v14.4s, v5.4s\n"
395 "ldr s17, [x24, x21]\n"
396 "fmla v11.4s, v18.4s, v6.4s\n"
397 "ldr s19, [x26, x20]\n"
398 "fmla v10.4s, v16.4s, v1.4s\n"
399 "ldr s18, [x25, x21]\n"
400 "fmla v9.4s, v15.4s, v7.4s\n"
401 "ldr s16, [x26, x21]\n"
402 "fmla v11.4s, v15.4s, v1.4s\n"
403 "add %[wbptr], %[wbptr], #40\n"
404 "fmla v10.4s, v14.4s, v3.4s\n"
405 "prfm pldl1keep, [%[wbptr], #64]\n"
406 "fmla v9.4s, v12.4s, v2.4s\n"
407 "add %[inptr0], %[inptr0], #4\n"
408 "fmla v11.4s, v13.4s, v3.4s\n"
409 "add x23, x23, #4\n"
410 "fmla v10.4s, v12.4s, v0.4s\n"
411 "add x24, x24, #4\n"
412 "fmla v9.4s, v20.4s, v4.4s\n"
413 "add x25, x25, #4\n"
414 "fmla v11.4s, v17.4s, v0.4s\n"
415 "add x26, x26, #4\n"
416 "str s10, [x22]\n"
417 "fmla v9.4s, v17.4s, v6.4s\n"
418 "str s11, [%[outptr0], %[output_col_stride1]]\n"
419 "add %[outptr0], %[outptr0], #4\n"
420 "fmla v9.4s, v19.4s, v1.4s\n"
421 "fmla v9.4s, v18.4s, v3.4s\n"
422 "fmla v9.4s, v16.4s, v0.4s\n"
423 "str s9, [x22, %[output_col_stride1]]\n"
424 "add x22, x22, #4\n"
425 "7:\n"
  // Operands: the three pointers are read-write ("+r") because the assembly
  // advances them; the strides are converted from floats to bytes here. The
  // clobber list names every scratch register used above (x19-x28, v0-v20).
426 : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
427 : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
428 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
429 );
430}
Georgios Pinitas4074c992018-01-30 18:13:46 +0000431
/**
 * Specialisation of Conv::execute_tile for ActivationFunction::None that reads
 * inputs and writes outputs through tables of per-element pointers (AArch64
 * inline assembly).
 *
 * Each pass accumulates four outputs, each over a 3x3 window of inputs, with
 * the windows two rows / two columns apart, and stores them without any
 * clamping. A single byte offset is added to every input pointer (x23) and to
 * every output pointer (x24); both advance by 16 per four-channel pass and by
 * 4 per single-channel pass.
 *
 * Channels are handled four at a time with q-registers (n_channels >> 2
 * passes) and then one at a time with s-registers (n_channels & 3 passes),
 * using a prologue / loop body / drain schedule in each section.
 *
 * @param n_channels      Number of channels to process.
 * @param weight_bias_ptr Packed parameters. Every pass reads ten consecutive
 *                        vectors (160 bytes per four-channel pass, 40 bytes
 *                        per single-channel pass): the first one seeds all
 *                        four accumulators and the following nine are the 3x3
 *                        weights in row-major order.
 * @param inptrs          Table of input pointers. The assembly reads 25
 *                        entries at byte offsets 0..192 with 40 bytes between
 *                        rows, i.e. it treats the table as 5x5 eight-byte
 *                        pointers.
 * @param outptrs         Table of output pointers. The entries at byte
 *                        offsets 0, 8, 16 and 24 receive outputs (0,0), (0,1),
 *                        (1,0) and (1,1).
 */
432template <>
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000433template <>
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100434void Conv::execute_tile<ActivationFunction::None>(
435 int n_channels,
436 const void *weight_bias_ptr,
437 const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
438 float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
439)
440{
441 __asm __volatile(
  // Setup. x23 / x24 are the running byte offsets applied to the input /
  // output pointers (both start at zero). x26 = n_channels >> 2
  // (four-channel passes) and x25 = n_channels & 3 (single-channel passes).
442 "mov x23, xzr\n"
443 "mov x24, xzr\n"
444 "and x25, %[n_channels], #3\n"
445 "lsr x26, %[n_channels], #2\n"
  // No complete group of four channels: go straight to the scalar section.
446 "cbz x26, 4f\n"
  // Vector prologue. v13 (offset 0) is copied into the accumulators v10, v8,
  // v9 and v7; v12, v6, v5 / v11, v4, v3 / v2, v1, v0 hold the three weight
  // rows. Input pointers are fetched from the table into x19-x22 / x27 just
  // before use, and output (0,0) (v10) is started.
447 "1:\n"
448 "ldr q13, [%[wbptr]]\n"
449 "ldr x19, [%[inptrs], 0]\n"
450 "mov v10.16b, v13.16b\n"
451 "ldr q12, [%[wbptr], #16]\n"
452 "mov v8.16b, v13.16b\n"
453 "ldr q6, [%[wbptr], #32]\n"
454 "mov v9.16b, v13.16b\n"
455 "ldr q5, [%[wbptr], #48]\n"
456 "mov v7.16b, v13.16b\n"
457 "ldr q11, [%[wbptr], #64]\n"
458 "ldr q4, [%[wbptr], #80]\n"
459 "ldr x20, [%[inptrs], 40]\n"
460 "ldr q3, [%[wbptr], #96]\n"
461 "ldr x21, [%[inptrs], 80]\n"
462 "ldr q2, [%[wbptr], #112]\n"
463 "ldr x27, [%[inptrs], 120]\n"
464 "ldr q1, [%[wbptr], #128]\n"
465 "subs x26, x26, #1\n"
466 "ldr q0, [%[wbptr], #144]\n"
467 "ldr q14, [x19, x23]\n"
468 "fmla v10.4s, v14.4s, v12.4s\n"
469 "ldr q18, [x20, x23]\n"
470 "ldr q14, [x21, x23]\n"
471 "ldr x19, [%[inptrs], 8]\n"
472 "ldr q16, [x27, x23]\n"
473 "ldr x20, [%[inptrs], 48]\n"
474 "ldr q19, [x19, x23]\n"
475 "ldr x21, [%[inptrs], 88]\n"
476 "fmla v10.4s, v18.4s, v11.4s\n"
477 "ldr q15, [x20, x23]\n"
478 "ldr q18, [x21, x23]\n"
479 "ldr x19, [%[inptrs], 16]\n"
480 "ldr q13, [x19, x23]\n"
481 "fmla v10.4s, v19.4s, v6.4s\n"
482 "fmla v10.4s, v14.4s, v2.4s\n"
  // The flags still come from the `subs` above (nothing in between sets
  // them): if that was the last four-channel pass, skip the loop body.
483 "beq 3f\n"
  // Vector loop body. Finishes the four outputs of the current pass and
  // stores them through the output table (v10 -> entry at offset 0,
  // v9 -> offset 8, v8 -> offset 16, v7 -> offset 24), while advancing wbptr
  // by 160 bytes and x23 / x24 by 16 and repeating the prologue work for the
  // next pass.
484 "2:\n"
485 "fmla v8.4s, v14.4s, v12.4s\n"
486 "ldr x20, [%[inptrs], 56]\n"
487 "fmla v10.4s, v15.4s, v4.4s\n"
488 "ldr x19, [%[inptrs], 24]\n"
489 "fmla v9.4s, v13.4s, v12.4s\n"
490 "ldr q14, [x20, x23]\n"
491 "ldr q17, [x19, x23]\n"
492 "ldr x22, [%[inptrs], 160]\n"
493 "fmla v8.4s, v16.4s, v11.4s\n"
494 "ldr x27, [%[inptrs], 128]\n"
495 "fmla v10.4s, v13.4s, v5.4s\n"
496 "ldr q15, [x22, x23]\n"
497 "fmla v9.4s, v14.4s, v11.4s\n"
498 "ldr q19, [x27, x23]\n"
499 "ldr x21, [%[inptrs], 96]\n"
500 "ldr x20, [%[inptrs], 64]\n"
501 "ldr x19, [%[inptrs], 32]\n"
502 "fmla v8.4s, v18.4s, v6.4s\n"
503 "ldr x22, [%[inptrs], 168]\n"
504 "fmla v10.4s, v18.4s, v1.4s\n"
505 "ldr q13, [x21, x23]\n"
506 "fmla v9.4s, v17.4s, v6.4s\n"
507 "ldr q18, [x20, x23]\n"
508 "fmla v7.4s, v13.4s, v12.4s\n"
509 "ldr q17, [x19, x23]\n"
510 "fmla v8.4s, v15.4s, v2.4s\n"
511 "ldr q15, [x22, x23]\n"
512 "fmla v10.4s, v14.4s, v3.4s\n"
513 "ldr x27, [%[inptrs], 136]\n"
514 "fmla v9.4s, v13.4s, v2.4s\n"
515 "ldr x21, [%[inptrs], 104]\n"
516 "ldr q16, [x27, x23]\n"
517 "ldr x20, [%[inptrs], 72]\n"
518 "fmla v8.4s, v19.4s, v4.4s\n"
519 "ldr q19, [x21, x23]\n"
520 "fmla v10.4s, v13.4s, v0.4s\n"
521 "ldr q12, [x20, x23]\n"
522 "fmla v9.4s, v18.4s, v4.4s\n"
523 "ldr x22, [%[inptrs], 176]\n"
524 "fmla v7.4s, v16.4s, v11.4s\n"
525 "ldr x27, [%[inptrs], 144]\n"
526 "fmla v8.4s, v13.4s, v5.4s\n"
527 "ldr q11, [x22, x23]\n"
528 "ldr q13, [x27, x23]\n"
529 "ldr x21, [%[inptrs], 112]\n"
530 "fmla v9.4s, v17.4s, v5.4s\n"
531 "ldr x22, [%[inptrs], 184]\n"
532 "fmla v7.4s, v19.4s, v6.4s\n"
533 "ldr q14, [x21, x23]\n"
534 "fmla v8.4s, v15.4s, v1.4s\n"
535 "ldr q17, [x22, x23]\n"
536 "ldr x27, [%[inptrs], 152]\n"
537 "ldr x22, [%[inptrs], 192]\n"
538 "ldr x21, [%[outptrs], 0]\n"
539 "fmla v9.4s, v19.4s, v1.4s\n"
540 "ldr x28, [%[outptrs], 16]\n"
541 "str q10, [x21, x24]\n"
542 "fmla v7.4s, v11.4s, v2.4s\n"
543 "fmla v8.4s, v16.4s, v3.4s\n"
544 "ldr q16, [x27, x23]\n"
545 "ldr q15, [x22, x23]\n"
546 "ldr x21, [%[outptrs], 8]\n"
547 "fmla v9.4s, v12.4s, v3.4s\n"
548 "add %[wbptr], %[wbptr], #160\n"
549 "fmla v7.4s, v13.4s, v4.4s\n"
550 "ldr q13, [%[wbptr]]\n"
551 "fmla v8.4s, v11.4s, v0.4s\n"
552 "ldr q12, [%[wbptr], #16]\n"
553 "mov v10.16b, v13.16b\n"
554 "ldr q6, [%[wbptr], #32]\n"
555 "fmla v9.4s, v14.4s, v0.4s\n"
556 "ldr q11, [%[wbptr], #64]\n"
557 "fmla v7.4s, v14.4s, v5.4s\n"
558 "ldr q4, [%[wbptr], #80]\n"
559 "str q8, [x28, x24]\n"
560 "add x23, x23, #16\n"
561 "mov v8.16b, v13.16b\n"
562 "ldr q2, [%[wbptr], #112]\n"
563 "str q9, [x21, x24]\n"
564 "ldr x28, [%[outptrs], 24]\n"
565 "fmla v7.4s, v17.4s, v1.4s\n"
566 "ldr q5, [%[wbptr], #48]\n"
567 "mov v9.16b, v13.16b\n"
568 "prfm pldl1keep, [%[wbptr], #64]\n"
569 "ldr x19, [%[inptrs], 0]\n"
570 "ldr x20, [%[inptrs], 40]\n"
571 "ldr x21, [%[inptrs], 80]\n"
572 "ldr x27, [%[inptrs], 120]\n"
573 "subs x26, x26, #1\n"
574 "fmla v7.4s, v16.4s, v3.4s\n"
575 "ldr q1, [%[wbptr], #128]\n"
576 "ldr q14, [x19, x23]\n"
577 "fmla v10.4s, v14.4s, v12.4s\n"
578 "ldr q18, [x20, x23]\n"
579 "ldr q14, [x21, x23]\n"
580 "ldr x19, [%[inptrs], 8]\n"
581 "fmla v7.4s, v15.4s, v0.4s\n"
582 "ldr q3, [%[wbptr], #96]\n"
583 "ldr q19, [x19, x23]\n"
584 "ldr x20, [%[inptrs], 48]\n"
585 "fmla v10.4s, v18.4s, v11.4s\n"
586 "ldr q16, [x27, x23]\n"
587 "ldr q15, [x20, x23]\n"
588 "ldr x19, [%[inptrs], 16]\n"
589 "str q7, [x28, x24]\n"
590 "ldr x21, [%[inptrs], 88]\n"
591 "mov v7.16b, v13.16b\n"
592 "ldr q0, [%[wbptr], #144]\n"
593 "fmla v10.4s, v19.4s, v6.4s\n"
594 "ldr q13, [x19, x23]\n"
595 "ldr q18, [x21, x23]\n"
596 "add x24, x24, #16\n"
597 "fmla v10.4s, v14.4s, v2.4s\n"
598 "bne 2b\n"
  // Vector drain. Finishes and stores the last four-channel pass; the
  // offsets and wbptr are still advanced so the scalar section continues
  // from the next channel.
599 "3:\n"
600 "fmla v8.4s, v14.4s, v12.4s\n"
601 "ldr x20, [%[inptrs], 56]\n"
602 "fmla v10.4s, v15.4s, v4.4s\n"
603 "ldr x19, [%[inptrs], 24]\n"
604 "fmla v9.4s, v13.4s, v12.4s\n"
605 "ldr q14, [x20, x23]\n"
606 "ldr q17, [x19, x23]\n"
607 "ldr x22, [%[inptrs], 160]\n"
608 "fmla v8.4s, v16.4s, v11.4s\n"
609 "ldr x27, [%[inptrs], 128]\n"
610 "fmla v10.4s, v13.4s, v5.4s\n"
611 "ldr q15, [x22, x23]\n"
612 "fmla v9.4s, v14.4s, v11.4s\n"
613 "ldr q19, [x27, x23]\n"
614 "ldr x21, [%[inptrs], 96]\n"
615 "ldr x20, [%[inptrs], 64]\n"
616 "ldr x19, [%[inptrs], 32]\n"
617 "fmla v8.4s, v18.4s, v6.4s\n"
618 "ldr x22, [%[inptrs], 168]\n"
619 "fmla v10.4s, v18.4s, v1.4s\n"
620 "ldr q13, [x21, x23]\n"
621 "fmla v9.4s, v17.4s, v6.4s\n"
622 "ldr q18, [x20, x23]\n"
623 "fmla v7.4s, v13.4s, v12.4s\n"
624 "ldr q17, [x19, x23]\n"
625 "fmla v8.4s, v15.4s, v2.4s\n"
626 "ldr q15, [x22, x23]\n"
627 "fmla v10.4s, v14.4s, v3.4s\n"
628 "ldr x27, [%[inptrs], 136]\n"
629 "fmla v9.4s, v13.4s, v2.4s\n"
630 "ldr x21, [%[inptrs], 104]\n"
631 "ldr q16, [x27, x23]\n"
632 "ldr x20, [%[inptrs], 72]\n"
633 "fmla v8.4s, v19.4s, v4.4s\n"
634 "ldr q19, [x21, x23]\n"
635 "fmla v10.4s, v13.4s, v0.4s\n"
636 "ldr q12, [x20, x23]\n"
637 "fmla v9.4s, v18.4s, v4.4s\n"
638 "ldr x22, [%[inptrs], 176]\n"
639 "fmla v7.4s, v16.4s, v11.4s\n"
640 "ldr x27, [%[inptrs], 144]\n"
641 "fmla v8.4s, v13.4s, v5.4s\n"
642 "ldr q11, [x22, x23]\n"
643 "ldr q13, [x27, x23]\n"
644 "ldr x21, [%[inptrs], 112]\n"
645 "fmla v9.4s, v17.4s, v5.4s\n"
646 "ldr x22, [%[inptrs], 184]\n"
647 "fmla v7.4s, v19.4s, v6.4s\n"
648 "ldr q14, [x21, x23]\n"
649 "fmla v8.4s, v15.4s, v1.4s\n"
650 "ldr q17, [x22, x23]\n"
651 "ldr x27, [%[inptrs], 152]\n"
652 "ldr x22, [%[inptrs], 192]\n"
653 "ldr x21, [%[outptrs], 0]\n"
654 "fmla v9.4s, v19.4s, v1.4s\n"
655 "ldr x28, [%[outptrs], 16]\n"
656 "str q10, [x21, x24]\n"
657 "fmla v7.4s, v11.4s, v2.4s\n"
658 "fmla v8.4s, v16.4s, v3.4s\n"
659 "ldr q16, [x27, x23]\n"
660 "ldr q15, [x22, x23]\n"
661 "ldr x21, [%[outptrs], 8]\n"
662 "fmla v9.4s, v12.4s, v3.4s\n"
663 "add %[wbptr], %[wbptr], #160\n"
664 "fmla v7.4s, v13.4s, v4.4s\n"
665 "prfm pldl1keep, [%[wbptr], #64]\n"
666 "fmla v8.4s, v11.4s, v0.4s\n"
667 "add x23, x23, #16\n"
668 "fmla v9.4s, v14.4s, v0.4s\n"
669 "fmla v7.4s, v14.4s, v5.4s\n"
670 "str q8, [x28, x24]\n"
671 "ldr x28, [%[outptrs], 24]\n"
672 "str q9, [x21, x24]\n"
673 "fmla v7.4s, v17.4s, v1.4s\n"
674 "fmla v7.4s, v16.4s, v3.4s\n"
675 "fmla v7.4s, v15.4s, v0.4s\n"
676 "str q7, [x28, x24]\n"
677 "add x24, x24, #16\n"
  // Scalar section: the same prologue / loop / drain schedule using
  // s-register loads and stores, one channel per pass. Parameters are read at
  // 4-byte spacing (40 bytes per channel) and x23 / x24 advance by 4.
678 "4:\n"
679 "cbz x25, 7f\n"
680 "ldr s13, [%[wbptr]]\n"
681 "mov v10.16b, v13.16b\n"
682 "ldr s12, [%[wbptr], #4]\n"
683 "mov v8.16b, v13.16b\n"
684 "ldr s6, [%[wbptr], #8]\n"
685 "mov v9.16b, v13.16b\n"
686 "ldr s5, [%[wbptr], #12]\n"
687 "mov v7.16b, v13.16b\n"
688 "ldr s11, [%[wbptr], #16]\n"
689 "ldr s4, [%[wbptr], #20]\n"
690 "ldr x19, [%[inptrs], 0]\n"
691 "ldr s3, [%[wbptr], #24]\n"
692 "ldr x20, [%[inptrs], 40]\n"
693 "ldr s2, [%[wbptr], #28]\n"
694 "ldr x21, [%[inptrs], 80]\n"
695 "ldr s1, [%[wbptr], #32]\n"
696 "ldr x27, [%[inptrs], 120]\n"
697 "ldr s0, [%[wbptr], #36]\n"
698 "subs x25, x25, #1\n"
699 "ldr s14, [x19, x23]\n"
700 "ldr s18, [x20, x23]\n"
701 "fmla v10.4s, v14.4s, v12.4s\n"
702 "ldr s14, [x21, x23]\n"
703 "ldr s16, [x27, x23]\n"
704 "ldr x19, [%[inptrs], 8]\n"
705 "ldr x20, [%[inptrs], 48]\n"
706 "ldr x21, [%[inptrs], 88]\n"
707 "ldr s19, [x19, x23]\n"
708 "fmla v10.4s, v18.4s, v11.4s\n"
709 "ldr s15, [x20, x23]\n"
710 "ldr s18, [x21, x23]\n"
711 "ldr x19, [%[inptrs], 16]\n"
712 "ldr s13, [x19, x23]\n"
713 "fmla v10.4s, v19.4s, v6.4s\n"
714 "fmla v10.4s, v14.4s, v2.4s\n"
  // Flags from the `subs x25` above: last remaining channel goes to the
  // scalar drain.
715 "beq 6f\n"
  // Scalar loop body.
716 "5:\n"
717 "fmla v8.4s, v14.4s, v12.4s\n"
718 "ldr x20, [%[inptrs], 56]\n"
719 "fmla v10.4s, v15.4s, v4.4s\n"
720 "ldr x19, [%[inptrs], 24]\n"
721 "fmla v9.4s, v13.4s, v12.4s\n"
722 "ldr s14, [x20, x23]\n"
723 "ldr s17, [x19, x23]\n"
724 "ldr x22, [%[inptrs], 160]\n"
725 "fmla v8.4s, v16.4s, v11.4s\n"
726 "ldr x27, [%[inptrs], 128]\n"
727 "fmla v10.4s, v13.4s, v5.4s\n"
728 "ldr s15, [x22, x23]\n"
729 "fmla v9.4s, v14.4s, v11.4s\n"
730 "ldr s19, [x27, x23]\n"
731 "ldr x21, [%[inptrs], 96]\n"
732 "ldr x20, [%[inptrs], 64]\n"
733 "ldr x19, [%[inptrs], 32]\n"
734 "fmla v8.4s, v18.4s, v6.4s\n"
735 "ldr x22, [%[inptrs], 168]\n"
736 "fmla v10.4s, v18.4s, v1.4s\n"
737 "ldr s13, [x21, x23]\n"
738 "fmla v9.4s, v17.4s, v6.4s\n"
739 "ldr s18, [x20, x23]\n"
740 "fmla v7.4s, v13.4s, v12.4s\n"
741 "ldr s17, [x19, x23]\n"
742 "fmla v8.4s, v15.4s, v2.4s\n"
743 "ldr s15, [x22, x23]\n"
744 "fmla v10.4s, v14.4s, v3.4s\n"
745 "ldr x27, [%[inptrs], 136]\n"
746 "fmla v9.4s, v13.4s, v2.4s\n"
747 "ldr x21, [%[inptrs], 104]\n"
748 "ldr s16, [x27, x23]\n"
749 "ldr x20, [%[inptrs], 72]\n"
750 "fmla v8.4s, v19.4s, v4.4s\n"
751 "ldr s19, [x21, x23]\n"
752 "fmla v10.4s, v13.4s, v0.4s\n"
753 "ldr s12, [x20, x23]\n"
754 "fmla v9.4s, v18.4s, v4.4s\n"
755 "ldr x22, [%[inptrs], 176]\n"
756 "fmla v7.4s, v16.4s, v11.4s\n"
757 "ldr x27, [%[inptrs], 144]\n"
758 "fmla v8.4s, v13.4s, v5.4s\n"
759 "ldr s11, [x22, x23]\n"
760 "ldr s13, [x27, x23]\n"
761 "ldr x21, [%[inptrs], 112]\n"
762 "fmla v9.4s, v17.4s, v5.4s\n"
763 "ldr x22, [%[inptrs], 184]\n"
764 "fmla v7.4s, v19.4s, v6.4s\n"
765 "ldr s14, [x21, x23]\n"
766 "fmla v8.4s, v15.4s, v1.4s\n"
767 "ldr s17, [x22, x23]\n"
768 "ldr x27, [%[inptrs], 152]\n"
769 "ldr x22, [%[inptrs], 192]\n"
770 "ldr x21, [%[outptrs], 0]\n"
771 "fmla v9.4s, v19.4s, v1.4s\n"
772 "ldr x28, [%[outptrs], 16]\n"
773 "str s10, [x21, x24]\n"
774 "fmla v7.4s, v11.4s, v2.4s\n"
775 "fmla v8.4s, v16.4s, v3.4s\n"
776 "ldr s16, [x27, x23]\n"
777 "ldr s15, [x22, x23]\n"
778 "ldr x21, [%[outptrs], 8]\n"
779 "fmla v9.4s, v12.4s, v3.4s\n"
780 "add %[wbptr], %[wbptr], #40\n"
781 "fmla v7.4s, v13.4s, v4.4s\n"
782 "ldr s13, [%[wbptr]]\n"
783 "fmla v8.4s, v11.4s, v0.4s\n"
784 "ldr s12, [%[wbptr], #4]\n"
785 "mov v10.16b, v13.16b\n"
786 "ldr s6, [%[wbptr], #8]\n"
787 "fmla v9.4s, v14.4s, v0.4s\n"
788 "ldr s11, [%[wbptr], #16]\n"
789 "fmla v7.4s, v14.4s, v5.4s\n"
790 "ldr s4, [%[wbptr], #20]\n"
791 "str s8, [x28, x24]\n"
792 "add x23, x23, #4\n"
793 "mov v8.16b, v13.16b\n"
794 "ldr s2, [%[wbptr], #28]\n"
795 "str s9, [x21, x24]\n"
796 "ldr x28, [%[outptrs], 24]\n"
797 "fmla v7.4s, v17.4s, v1.4s\n"
798 "ldr s5, [%[wbptr], #12]\n"
799 "mov v9.16b, v13.16b\n"
800 "prfm pldl1keep, [%[wbptr], #64]\n"
801 "ldr x19, [%[inptrs], 0]\n"
802 "ldr x20, [%[inptrs], 40]\n"
803 "ldr x21, [%[inptrs], 80]\n"
804 "ldr x27, [%[inptrs], 120]\n"
805 "subs x25, x25, #1\n"
806 "fmla v7.4s, v16.4s, v3.4s\n"
807 "ldr s1, [%[wbptr], #32]\n"
808 "ldr s14, [x19, x23]\n"
809 "fmla v10.4s, v14.4s, v12.4s\n"
810 "ldr s18, [x20, x23]\n"
811 "ldr s14, [x21, x23]\n"
812 "ldr x19, [%[inptrs], 8]\n"
813 "fmla v7.4s, v15.4s, v0.4s\n"
814 "ldr s3, [%[wbptr], #24]\n"
815 "ldr s19, [x19, x23]\n"
816 "ldr x20, [%[inptrs], 48]\n"
817 "fmla v10.4s, v18.4s, v11.4s\n"
818 "ldr s16, [x27, x23]\n"
819 "ldr s15, [x20, x23]\n"
820 "ldr x19, [%[inptrs], 16]\n"
821 "str s7, [x28, x24]\n"
822 "ldr x21, [%[inptrs], 88]\n"
823 "mov v7.16b, v13.16b\n"
824 "ldr s0, [%[wbptr], #36]\n"
825 "fmla v10.4s, v19.4s, v6.4s\n"
826 "ldr s13, [x19, x23]\n"
827 "ldr s18, [x21, x23]\n"
828 "add x24, x24, #4\n"
829 "fmla v10.4s, v14.4s, v2.4s\n"
830 "bne 5b\n"
  // Scalar drain: finish and store the final channel.
831 "6:\n"
832 "fmla v8.4s, v14.4s, v12.4s\n"
833 "ldr x20, [%[inptrs], 56]\n"
834 "fmla v10.4s, v15.4s, v4.4s\n"
835 "ldr x19, [%[inptrs], 24]\n"
836 "fmla v9.4s, v13.4s, v12.4s\n"
837 "ldr s14, [x20, x23]\n"
838 "ldr s17, [x19, x23]\n"
839 "ldr x22, [%[inptrs], 160]\n"
840 "fmla v8.4s, v16.4s, v11.4s\n"
841 "ldr x27, [%[inptrs], 128]\n"
842 "fmla v10.4s, v13.4s, v5.4s\n"
843 "ldr s15, [x22, x23]\n"
844 "fmla v9.4s, v14.4s, v11.4s\n"
845 "ldr s19, [x27, x23]\n"
846 "ldr x21, [%[inptrs], 96]\n"
847 "ldr x20, [%[inptrs], 64]\n"
848 "ldr x19, [%[inptrs], 32]\n"
849 "fmla v8.4s, v18.4s, v6.4s\n"
850 "ldr x22, [%[inptrs], 168]\n"
851 "fmla v10.4s, v18.4s, v1.4s\n"
852 "ldr s13, [x21, x23]\n"
853 "fmla v9.4s, v17.4s, v6.4s\n"
854 "ldr s18, [x20, x23]\n"
855 "fmla v7.4s, v13.4s, v12.4s\n"
856 "ldr s17, [x19, x23]\n"
857 "fmla v8.4s, v15.4s, v2.4s\n"
858 "ldr s15, [x22, x23]\n"
859 "fmla v10.4s, v14.4s, v3.4s\n"
860 "ldr x27, [%[inptrs], 136]\n"
861 "fmla v9.4s, v13.4s, v2.4s\n"
862 "ldr x21, [%[inptrs], 104]\n"
863 "ldr s16, [x27, x23]\n"
864 "ldr x20, [%[inptrs], 72]\n"
865 "fmla v8.4s, v19.4s, v4.4s\n"
866 "ldr s19, [x21, x23]\n"
867 "fmla v10.4s, v13.4s, v0.4s\n"
868 "ldr s12, [x20, x23]\n"
869 "fmla v9.4s, v18.4s, v4.4s\n"
870 "ldr x22, [%[inptrs], 176]\n"
871 "fmla v7.4s, v16.4s, v11.4s\n"
872 "ldr x27, [%[inptrs], 144]\n"
873 "fmla v8.4s, v13.4s, v5.4s\n"
874 "ldr s11, [x22, x23]\n"
875 "ldr s13, [x27, x23]\n"
876 "ldr x21, [%[inptrs], 112]\n"
877 "fmla v9.4s, v17.4s, v5.4s\n"
878 "ldr x22, [%[inptrs], 184]\n"
879 "fmla v7.4s, v19.4s, v6.4s\n"
880 "ldr s14, [x21, x23]\n"
881 "fmla v8.4s, v15.4s, v1.4s\n"
882 "ldr s17, [x22, x23]\n"
883 "ldr x27, [%[inptrs], 152]\n"
884 "ldr x22, [%[inptrs], 192]\n"
885 "ldr x21, [%[outptrs], 0]\n"
886 "fmla v9.4s, v19.4s, v1.4s\n"
887 "ldr x28, [%[outptrs], 16]\n"
888 "str s10, [x21, x24]\n"
889 "fmla v7.4s, v11.4s, v2.4s\n"
890 "fmla v8.4s, v16.4s, v3.4s\n"
891 "ldr s16, [x27, x23]\n"
892 "ldr s15, [x22, x23]\n"
893 "ldr x21, [%[outptrs], 8]\n"
894 "fmla v9.4s, v12.4s, v3.4s\n"
895 "add %[wbptr], %[wbptr], #40\n"
896 "fmla v7.4s, v13.4s, v4.4s\n"
897 "prfm pldl1keep, [%[wbptr], #64]\n"
898 "fmla v8.4s, v11.4s, v0.4s\n"
899 "add x23, x23, #4\n"
900 "fmla v9.4s, v14.4s, v0.4s\n"
901 "fmla v7.4s, v14.4s, v5.4s\n"
902 "str s8, [x28, x24]\n"
903 "ldr x28, [%[outptrs], 24]\n"
904 "str s9, [x21, x24]\n"
905 "fmla v7.4s, v17.4s, v1.4s\n"
906 "fmla v7.4s, v16.4s, v3.4s\n"
907 "fmla v7.4s, v15.4s, v0.4s\n"
908 "str s7, [x28, x24]\n"
909 "add x24, x24, #4\n"
910 "7:\n"
  // Operands: only wbptr is read-write ("+r") because the assembly advances
  // it; the two pointer tables are read-only register inputs. The clobber
  // list names every scratch register used above (x19-x28, v0-v19).
911 : [wbptr] "+r" (weight_bias_ptr)
912 : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
913 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
914 );
915}
916
/** Compute one 2x2 output tile with ReLU for every channel, reading the input through strides.
 *
 * The whole body is a single AArch64 inline-assembly statement. Channels are
 * handled four at a time with 128-bit (q) loads/stores, then any remaining
 * (n_channels & 3) channels one at a time with 32-bit (s) loads/stores.
 *
 * For each channel group the kernel reads 10 packed values from weight_bias_ptr:
 * the first seeds all four accumulators (presumably the bias - confirm against
 * the packing code) and the other nine are used as fmla multipliers (presumably
 * the 3x3 weights). It reads a 5x5 patch of input positions (5 row pointers x
 * 5 column offsets) and writes four results: output, output + col stride,
 * output + row stride and output + row stride + col stride.
 *
 * @param[in]  n_channels        Number of channels to process.
 * @param[in]  weight_bias_ptr   Packed parameters; advanced by 160 bytes per four-channel
 *                               group and by 40 bytes per single channel.
 * @param[in]  input             Pointer to the first input element of the patch.
 * @param[in]  input_row_stride  Input row stride in elements (scaled by sizeof(float) before entering the asm).
 * @param[in]  input_col_stride  Input column stride in elements (scaled likewise).
 * @param[out] output            Pointer to the first output element of the tile.
 * @param[in]  output_row_stride Output row stride in elements (scaled likewise).
 * @param[in]  output_col_stride Output column stride in elements (scaled likewise).
 *
 * The asm modifies its local copies of weight_bias_ptr, input and output ("+r"
 * operands); these are by-value parameters, so the caller's values are untouched.
 */
917template <>
918template <>
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000919void Conv::execute_tile<ActivationFunction::ReLU>(
920 int n_channels,
921 const void *weight_bias_ptr,
922 const float *input,
923 const unsigned int input_row_stride,
924 const unsigned int input_col_stride,
925 float *output,
926 const unsigned int output_row_stride,
927 const unsigned int output_col_stride
928)
929{
930 __asm __volatile(
    // Setup. Input row pointers: inptr0 (row 0), x24 (row 1), x25 (row 2),
    // x28 (row 3), x26 (row 4). Column byte offsets: input_col_stride1 (x1),
    // x27 (x2), x23 (x3), x22 (x4). x19 = second output row.
    // x21 = n_channels / 4 (vector iterations), x20 = n_channels % 4 (scalar tail).
931 "add x24, %[inptr0], %[input_row_stride]\n"
932 "add x27, %[input_col_stride1], %[input_col_stride1]\n"
933 "add x19, %[outptr0], %[output_row_stride]\n"
934 "add x25, x24, %[input_row_stride]\n"
935 "add x23, x27, %[input_col_stride1]\n"
936 "and x20, %[n_channels], #3\n"
937 "add x28, x25, %[input_row_stride]\n"
938 "add x22, x23, %[input_col_stride1]\n"
939 "lsr x21, %[n_channels], #2\n"
940 "add x26, x28, %[input_row_stride]\n"
941 "cbz x21, 4f\n"
    // 1: vector prologue. Seed accumulators v3/v2/v1/v0 from the first packed
    // vector (q16), load the nine multiplier vectors and the first inputs.
    // The flags set by "subs x21" are consumed by the "beq 3f" below: when
    // this is the only four-channel group, skip straight to the epilogue.
942 "1:\n"
943 "ldr q16, [%[wbptr]]\n"
944 "subs x21, x21, #1\n"
945 "mov v3.16b, v16.16b\n"
946 "ldr q4, [%[wbptr], #16]\n"
947 "mov v1.16b, v16.16b\n"
948 "ldr q5, [%[wbptr], #32]\n"
949 "mov v2.16b, v16.16b\n"
950 "ldr q12, [%[wbptr], #48]\n"
951 "mov v0.16b, v16.16b\n"
952 "ldr q11, [%[wbptr], #64]\n"
953 "ldr q10, [%[wbptr], #80]\n"
954 "ldr q6, [%[wbptr], #96]\n"
955 "ldr q9, [%[wbptr], #112]\n"
956 "ldr q8, [%[wbptr], #128]\n"
957 "ldr q7, [%[wbptr], #144]\n"
958 "ldr q21, [%[inptr0]]\n"
959 "fmla v3.4s, v21.4s, v4.4s\n"
960 "ldr q23, [x24]\n"
961 "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
962 "ldr q14, [x25]\n"
963 "fmla v1.4s, v14.4s, v4.4s\n"
964 "ldr q13, [x24, %[input_col_stride1]]\n"
965 "fmla v3.4s, v23.4s, v11.4s\n"
966 "ldr q18, [%[inptr0], x27]\n"
967 "ldr q15, [x28]\n"
968 "ldr q22, [x25, %[input_col_stride1]]\n"
969 "fmla v3.4s, v19.4s, v5.4s\n"
970 "fmla v3.4s, v14.4s, v9.4s\n"
971 "beq 3f\n"
    // 2: steady-state vector loop. Finishes the current group's four
    // accumulators, clamps them at zero (fmax against v20 = 0, i.e. ReLU) and
    // stores them, while interleaving the parameter and input loads for the
    // next group. All pointers advance by 16 bytes (four floats) and wbptr by
    // 160 bytes per iteration.
972 "2:\n"
973 "fmla v3.4s, v13.4s, v10.4s\n"
974 "ldr q17, [x24, x27]\n"
975 "fmla v2.4s, v18.4s, v4.4s\n"
976 "ldr q20, [%[inptr0], x23]\n"
977 "fmla v1.4s, v15.4s, v11.4s\n"
978 "ldr q19, [x26]\n"
979 "fmla v3.4s, v18.4s, v12.4s\n"
980 "ldr q13, [x28, %[input_col_stride1]]\n"
981 "fmla v2.4s, v17.4s, v11.4s\n"
982 "ldr q14, [x25, x27]\n"
983 "fmla v1.4s, v22.4s, v5.4s\n"
984 "ldr q15, [x24, x23]\n"
985 "fmla v3.4s, v22.4s, v8.4s\n"
986 "ldr q16, [%[inptr0], x22]\n"
987 "fmla v2.4s, v20.4s, v5.4s\n"
988 "ldr q20, [x26, %[input_col_stride1]]\n"
989 "fmla v1.4s, v19.4s, v9.4s\n"
990 "ldr q19, [x28, x27]\n"
991 "fmla v3.4s, v17.4s, v6.4s\n"
992 "ldr q21, [x25, x23]\n"
993 "fmla v2.4s, v14.4s, v9.4s\n"
994 "ldr q22, [x24, x22]\n"
995 "fmla v1.4s, v13.4s, v10.4s\n"
996 "ldr q23, [x26, x27]\n"
997 "fmla v3.4s, v14.4s, v7.4s\n"
998 "ldr q18, [x28, x23]\n"
999 "fmla v0.4s, v14.4s, v4.4s\n"
1000 "ldr q13, [x25, x22]\n"
1001 "fmla v1.4s, v14.4s, v12.4s\n"
1002 "ldr q14, [x26, x23]\n"
1003 "fmla v2.4s, v15.4s, v10.4s\n"
1004 "ldr q17, [x28, x22]\n"
1005 "fmla v0.4s, v19.4s, v11.4s\n"
1006 "ldr q15, [x26, x22]\n"
1007 "fmla v1.4s, v20.4s, v8.4s\n"
1008 "add %[wbptr], %[wbptr], #160\n"
1009 "fmla v2.4s, v16.4s, v12.4s\n"
1010 "ldr q16, [%[wbptr]]\n"
1011 "fmla v0.4s, v21.4s, v5.4s\n"
1012 "ldr q4, [%[wbptr], #16]\n"
1013 "fmla v1.4s, v19.4s, v6.4s\n"
1014 "ldr q11, [%[wbptr], #64]\n"
1015 "fmla v2.4s, v21.4s, v8.4s\n"
1016 "prfm pldl1keep, [%[wbptr], #64]\n"
1017 "fmla v0.4s, v23.4s, v9.4s\n"
1018 "ldr q5, [%[wbptr], #32]\n"
1019 "fmla v1.4s, v23.4s, v7.4s\n"
1020 "add %[inptr0], %[inptr0], #16\n"
1021 "fmla v2.4s, v22.4s, v6.4s\n"
1022 "ldr q21, [%[inptr0]]\n"
1023 "fmla v0.4s, v18.4s, v10.4s\n"
1024 "ldr q9, [%[wbptr], #112]\n"
1025 "movi v20.16b, #0\n"
1026 "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
1027 "fmla v2.4s, v13.4s, v7.4s\n"
1028 "ldr q18, [%[inptr0], x27]\n"
1029 "fmla v0.4s, v13.4s, v12.4s\n"
1030 "ldr q10, [%[wbptr], #80]\n"
1031 "fmax v3.4s, v3.4s, v20.4s\n"
1032 "add x24, x24, #16\n"
1033 "fmax v2.4s, v2.4s, v20.4s\n"
1034 "ldr q23, [x24]\n"
1035 "str q3, [%[outptr0]]\n"
1036 "fmla v0.4s, v14.4s, v8.4s\n"
1037 "str q2, [%[outptr0], %[output_col_stride1]]\n"
1038 "fmax v1.4s, v1.4s, v20.4s\n"
1039 "mov v3.16b, v16.16b\n"
1040 "ldr q12, [%[wbptr], #48]\n"
1041 "str q1, [x19]\n"
1042 "fmla v0.4s, v17.4s, v6.4s\n"
1043 "mov v1.16b, v16.16b\n"
1044 "ldr q8, [%[wbptr], #128]\n"
1045 "mov v2.16b, v16.16b\n"
1046 "ldr q13, [x24, %[input_col_stride1]]\n"
1047 "fmla v0.4s, v15.4s, v7.4s\n"
1048 "ldr q6, [%[wbptr], #96]\n"
1049 "fmla v3.4s, v21.4s, v4.4s\n"
1050 "add x25, x25, #16\n"
1051 "ldr q14, [x25]\n"
1052 "add x28, x28, #16\n"
1053 "fmax v0.4s, v0.4s, v20.4s\n"
1054 "ldr q7, [%[wbptr], #144]\n"
1055 "fmla v3.4s, v23.4s, v11.4s\n"
1056 "ldr q15, [x28]\n"
1057 "str q0, [x19, %[output_col_stride1]]\n"
1058 "fmla v1.4s, v14.4s, v4.4s\n"
1059 "mov v0.16b, v16.16b\n"
1060 "ldr q22, [x25, %[input_col_stride1]]\n"
1061 "fmla v3.4s, v19.4s, v5.4s\n"
1062 "add x26, x26, #16\n"
1063 "add %[outptr0], %[outptr0], #16\n"
1064 "add x19, x19, #16\n"
1065 "subs x21, x21, #1\n"
1066 "fmla v3.4s, v14.4s, v9.4s\n"
1067 "bne 2b\n"
    // 3: vector epilogue. Completes and stores the last four-channel group
    // without loading parameters or inputs for a following group. Pointers
    // are still advanced so the scalar tail below continues where this ends.
1068 "3:\n"
1069 "fmla v3.4s, v13.4s, v10.4s\n"
1070 "ldr q17, [x24, x27]\n"
1071 "fmla v2.4s, v18.4s, v4.4s\n"
1072 "ldr q20, [%[inptr0], x23]\n"
1073 "fmla v1.4s, v15.4s, v11.4s\n"
1074 "ldr q19, [x26]\n"
1075 "fmla v3.4s, v18.4s, v12.4s\n"
1076 "ldr q13, [x28, %[input_col_stride1]]\n"
1077 "fmla v2.4s, v17.4s, v11.4s\n"
1078 "ldr q14, [x25, x27]\n"
1079 "fmla v1.4s, v22.4s, v5.4s\n"
1080 "ldr q15, [x24, x23]\n"
1081 "fmla v3.4s, v22.4s, v8.4s\n"
1082 "ldr q16, [%[inptr0], x22]\n"
1083 "fmla v2.4s, v20.4s, v5.4s\n"
1084 "ldr q20, [x26, %[input_col_stride1]]\n"
1085 "fmla v1.4s, v19.4s, v9.4s\n"
1086 "ldr q19, [x28, x27]\n"
1087 "fmla v3.4s, v17.4s, v6.4s\n"
1088 "ldr q21, [x25, x23]\n"
1089 "fmla v2.4s, v14.4s, v9.4s\n"
1090 "ldr q22, [x24, x22]\n"
1091 "fmla v1.4s, v13.4s, v10.4s\n"
1092 "ldr q23, [x26, x27]\n"
1093 "fmla v3.4s, v14.4s, v7.4s\n"
1094 "ldr q18, [x28, x23]\n"
1095 "fmla v0.4s, v14.4s, v4.4s\n"
1096 "ldr q13, [x25, x22]\n"
1097 "fmla v1.4s, v14.4s, v12.4s\n"
1098 "ldr q14, [x26, x23]\n"
1099 "fmla v2.4s, v15.4s, v10.4s\n"
1100 "ldr q17, [x28, x22]\n"
1101 "fmla v0.4s, v19.4s, v11.4s\n"
1102 "ldr q15, [x26, x22]\n"
1103 "fmla v1.4s, v20.4s, v8.4s\n"
1104 "add %[wbptr], %[wbptr], #160\n"
1105 "fmla v2.4s, v16.4s, v12.4s\n"
1106 "prfm pldl1keep, [%[wbptr], #64]\n"
1107 "fmla v0.4s, v21.4s, v5.4s\n"
1108 "add %[inptr0], %[inptr0], #16\n"
1109 "fmla v1.4s, v19.4s, v6.4s\n"
1110 "add x24, x24, #16\n"
1111 "fmla v2.4s, v21.4s, v8.4s\n"
1112 "add x25, x25, #16\n"
1113 "fmla v0.4s, v23.4s, v9.4s\n"
1114 "add x28, x28, #16\n"
1115 "fmla v1.4s, v23.4s, v7.4s\n"
1116 "add x26, x26, #16\n"
1117 "fmla v2.4s, v22.4s, v6.4s\n"
1118 "movi v20.16b, #0\n"
1119 "fmla v0.4s, v18.4s, v10.4s\n"
1120 "fmax v3.4s, v3.4s, v20.4s\n"
1121 "fmla v2.4s, v13.4s, v7.4s\n"
1122 "fmax v1.4s, v1.4s, v20.4s\n"
1123 "str q3, [%[outptr0]]\n"
1124 "fmla v0.4s, v13.4s, v12.4s\n"
1125 "str q1, [x19]\n"
1126 "fmax v2.4s, v2.4s, v20.4s\n"
1127 "fmla v0.4s, v14.4s, v8.4s\n"
1128 "str q2, [%[outptr0], %[output_col_stride1]]\n"
1129 "add %[outptr0], %[outptr0], #16\n"
1130 "fmla v0.4s, v17.4s, v6.4s\n"
1131 "fmla v0.4s, v15.4s, v7.4s\n"
1132 "fmax v0.4s, v0.4s, v20.4s\n"
1133 "str q0, [x19, %[output_col_stride1]]\n"
1134 "add x19, x19, #16\n"
    // 4: scalar tail prologue for the remaining x20 (= n_channels % 4)
    // channels. Same structure as label 1, but every load is a single 32-bit
    // float ("ldr s") and the packed parameters are 4 bytes apart.
1135 "4:\n"
1136 "cbz x20, 7f\n"
1137 "ldr s16, [%[wbptr]]\n"
1138 "mov v3.16b, v16.16b\n"
1139 "ldr s4, [%[wbptr], #4]\n"
1140 "mov v1.16b, v16.16b\n"
1141 "ldr s5, [%[wbptr], #8]\n"
1142 "mov v2.16b, v16.16b\n"
1143 "ldr s12, [%[wbptr], #12]\n"
1144 "mov v0.16b, v16.16b\n"
1145 "ldr s11, [%[wbptr], #16]\n"
1146 "ldr s10, [%[wbptr], #20]\n"
1147 "subs x20, x20, #1\n"
1148 "ldr s6, [%[wbptr], #24]\n"
1149 "ldr s9, [%[wbptr], #28]\n"
1150 "ldr s8, [%[wbptr], #32]\n"
1151 "ldr s7, [%[wbptr], #36]\n"
1152 "ldr s21, [%[inptr0]]\n"
1153 "ldr s23, [x24]\n"
1154 "fmla v3.4s, v21.4s, v4.4s\n"
1155 "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
1156 "ldr s14, [x25]\n"
1157 "ldr s13, [x24, %[input_col_stride1]]\n"
1158 "fmla v1.4s, v14.4s, v4.4s\n"
1159 "ldr s18, [%[inptr0], x27]\n"
1160 "fmla v3.4s, v23.4s, v11.4s\n"
1161 "ldr s15, [x28]\n"
1162 "ldr s22, [x25, %[input_col_stride1]]\n"
1163 "fmla v3.4s, v19.4s, v5.4s\n"
1164 "fmla v3.4s, v14.4s, v9.4s\n"
1165 "beq 6f\n"
    // 5: steady-state scalar loop, one channel per iteration. Mirrors label 2
    // with 32-bit loads/stores; pointers advance by 4 bytes and wbptr by
    // 40 bytes (10 floats) per channel.
1166 "5:\n"
1167 "fmla v3.4s, v13.4s, v10.4s\n"
1168 "ldr s17, [x24, x27]\n"
1169 "fmla v2.4s, v18.4s, v4.4s\n"
1170 "ldr s20, [%[inptr0], x23]\n"
1171 "fmla v1.4s, v15.4s, v11.4s\n"
1172 "ldr s19, [x26]\n"
1173 "fmla v3.4s, v18.4s, v12.4s\n"
1174 "ldr s13, [x28, %[input_col_stride1]]\n"
1175 "fmla v2.4s, v17.4s, v11.4s\n"
1176 "ldr s14, [x25, x27]\n"
1177 "fmla v1.4s, v22.4s, v5.4s\n"
1178 "ldr s15, [x24, x23]\n"
1179 "fmla v3.4s, v22.4s, v8.4s\n"
1180 "ldr s16, [%[inptr0], x22]\n"
1181 "fmla v2.4s, v20.4s, v5.4s\n"
1182 "ldr s20, [x26, %[input_col_stride1]]\n"
1183 "fmla v1.4s, v19.4s, v9.4s\n"
1184 "ldr s19, [x28, x27]\n"
1185 "fmla v3.4s, v17.4s, v6.4s\n"
1186 "ldr s21, [x25, x23]\n"
1187 "fmla v2.4s, v14.4s, v9.4s\n"
1188 "ldr s22, [x24, x22]\n"
1189 "fmla v1.4s, v13.4s, v10.4s\n"
1190 "ldr s23, [x26, x27]\n"
1191 "fmla v3.4s, v14.4s, v7.4s\n"
1192 "ldr s18, [x28, x23]\n"
1193 "fmla v0.4s, v14.4s, v4.4s\n"
1194 "ldr s13, [x25, x22]\n"
1195 "fmla v1.4s, v14.4s, v12.4s\n"
1196 "ldr s14, [x26, x23]\n"
1197 "fmla v2.4s, v15.4s, v10.4s\n"
1198 "ldr s17, [x28, x22]\n"
1199 "fmla v0.4s, v19.4s, v11.4s\n"
1200 "ldr s15, [x26, x22]\n"
1201 "fmla v1.4s, v20.4s, v8.4s\n"
1202 "add %[wbptr], %[wbptr], #40\n"
1203 "fmla v2.4s, v16.4s, v12.4s\n"
1204 "ldr s16, [%[wbptr]]\n"
1205 "fmla v0.4s, v21.4s, v5.4s\n"
1206 "ldr s4, [%[wbptr], #4]\n"
1207 "fmla v1.4s, v19.4s, v6.4s\n"
1208 "ldr s11, [%[wbptr], #16]\n"
1209 "fmla v2.4s, v21.4s, v8.4s\n"
1210 "prfm pldl1keep, [%[wbptr], #64]\n"
1211 "fmla v0.4s, v23.4s, v9.4s\n"
1212 "ldr s5, [%[wbptr], #8]\n"
1213 "fmla v1.4s, v23.4s, v7.4s\n"
1214 "add %[inptr0], %[inptr0], #4\n"
1215 "fmla v2.4s, v22.4s, v6.4s\n"
1216 "ldr s21, [%[inptr0]]\n"
1217 "fmla v0.4s, v18.4s, v10.4s\n"
1218 "ldr s9, [%[wbptr], #28]\n"
1219 "movi v20.16b, #0\n"
1220 "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
1221 "fmla v2.4s, v13.4s, v7.4s\n"
1222 "ldr s18, [%[inptr0], x27]\n"
1223 "fmla v0.4s, v13.4s, v12.4s\n"
1224 "ldr s10, [%[wbptr], #20]\n"
1225 "fmax v3.4s, v3.4s, v20.4s\n"
1226 "add x24, x24, #4\n"
1227 "fmax v2.4s, v2.4s, v20.4s\n"
1228 "ldr s23, [x24]\n"
1229 "str s3, [%[outptr0]]\n"
1230 "fmla v0.4s, v14.4s, v8.4s\n"
1231 "str s2, [%[outptr0], %[output_col_stride1]]\n"
1232 "fmax v1.4s, v1.4s, v20.4s\n"
1233 "mov v3.16b, v16.16b\n"
1234 "ldr s12, [%[wbptr], #12]\n"
1235 "str s1, [x19]\n"
1236 "fmla v0.4s, v17.4s, v6.4s\n"
1237 "mov v1.16b, v16.16b\n"
1238 "ldr s8, [%[wbptr], #32]\n"
1239 "mov v2.16b, v16.16b\n"
1240 "ldr s13, [x24, %[input_col_stride1]]\n"
1241 "fmla v0.4s, v15.4s, v7.4s\n"
1242 "ldr s6, [%[wbptr], #24]\n"
1243 "fmla v3.4s, v21.4s, v4.4s\n"
1244 "add x25, x25, #4\n"
1245 "ldr s14, [x25]\n"
1246 "add x28, x28, #4\n"
1247 "fmax v0.4s, v0.4s, v20.4s\n"
1248 "ldr s7, [%[wbptr], #36]\n"
1249 "fmla v3.4s, v23.4s, v11.4s\n"
1250 "ldr s15, [x28]\n"
1251 "str s0, [x19, %[output_col_stride1]]\n"
1252 "fmla v1.4s, v14.4s, v4.4s\n"
1253 "mov v0.16b, v16.16b\n"
1254 "ldr s22, [x25, %[input_col_stride1]]\n"
1255 "fmla v3.4s, v19.4s, v5.4s\n"
1256 "add x26, x26, #4\n"
1257 "add %[outptr0], %[outptr0], #4\n"
1258 "add x19, x19, #4\n"
1259 "subs x20, x20, #1\n"
1260 "fmla v3.4s, v14.4s, v9.4s\n"
1261 "bne 5b\n"
    // 6: scalar epilogue. Completes and stores the final single channel
    // without preloading a following one.
1262 "6:\n"
1263 "fmla v3.4s, v13.4s, v10.4s\n"
1264 "ldr s17, [x24, x27]\n"
1265 "fmla v2.4s, v18.4s, v4.4s\n"
1266 "ldr s20, [%[inptr0], x23]\n"
1267 "fmla v1.4s, v15.4s, v11.4s\n"
1268 "ldr s19, [x26]\n"
1269 "fmla v3.4s, v18.4s, v12.4s\n"
1270 "ldr s13, [x28, %[input_col_stride1]]\n"
1271 "fmla v2.4s, v17.4s, v11.4s\n"
1272 "ldr s14, [x25, x27]\n"
1273 "fmla v1.4s, v22.4s, v5.4s\n"
1274 "ldr s15, [x24, x23]\n"
1275 "fmla v3.4s, v22.4s, v8.4s\n"
1276 "ldr s16, [%[inptr0], x22]\n"
1277 "fmla v2.4s, v20.4s, v5.4s\n"
1278 "ldr s20, [x26, %[input_col_stride1]]\n"
1279 "fmla v1.4s, v19.4s, v9.4s\n"
1280 "ldr s19, [x28, x27]\n"
1281 "fmla v3.4s, v17.4s, v6.4s\n"
1282 "ldr s21, [x25, x23]\n"
1283 "fmla v2.4s, v14.4s, v9.4s\n"
1284 "ldr s22, [x24, x22]\n"
1285 "fmla v1.4s, v13.4s, v10.4s\n"
1286 "ldr s23, [x26, x27]\n"
1287 "fmla v3.4s, v14.4s, v7.4s\n"
1288 "ldr s18, [x28, x23]\n"
1289 "fmla v0.4s, v14.4s, v4.4s\n"
1290 "ldr s13, [x25, x22]\n"
1291 "fmla v1.4s, v14.4s, v12.4s\n"
1292 "ldr s14, [x26, x23]\n"
1293 "fmla v2.4s, v15.4s, v10.4s\n"
1294 "ldr s17, [x28, x22]\n"
1295 "fmla v0.4s, v19.4s, v11.4s\n"
1296 "ldr s15, [x26, x22]\n"
1297 "fmla v1.4s, v20.4s, v8.4s\n"
1298 "add %[wbptr], %[wbptr], #40\n"
1299 "fmla v2.4s, v16.4s, v12.4s\n"
1300 "prfm pldl1keep, [%[wbptr], #64]\n"
1301 "fmla v0.4s, v21.4s, v5.4s\n"
1302 "add %[inptr0], %[inptr0], #4\n"
1303 "fmla v1.4s, v19.4s, v6.4s\n"
1304 "add x24, x24, #4\n"
1305 "fmla v2.4s, v21.4s, v8.4s\n"
1306 "add x25, x25, #4\n"
1307 "fmla v0.4s, v23.4s, v9.4s\n"
1308 "add x28, x28, #4\n"
1309 "fmla v1.4s, v23.4s, v7.4s\n"
1310 "add x26, x26, #4\n"
1311 "fmla v2.4s, v22.4s, v6.4s\n"
1312 "movi v20.16b, #0\n"
1313 "fmla v0.4s, v18.4s, v10.4s\n"
1314 "fmax v3.4s, v3.4s, v20.4s\n"
1315 "fmla v2.4s, v13.4s, v7.4s\n"
1316 "fmax v1.4s, v1.4s, v20.4s\n"
1317 "str s3, [%[outptr0]]\n"
1318 "fmla v0.4s, v13.4s, v12.4s\n"
1319 "str s1, [x19]\n"
1320 "fmax v2.4s, v2.4s, v20.4s\n"
1321 "fmla v0.4s, v14.4s, v8.4s\n"
1322 "str s2, [%[outptr0], %[output_col_stride1]]\n"
1323 "add %[outptr0], %[outptr0], #4\n"
1324 "fmla v0.4s, v17.4s, v6.4s\n"
1325 "fmla v0.4s, v15.4s, v7.4s\n"
1326 "fmax v0.4s, v0.4s, v20.4s\n"
1327 "str s0, [x19, %[output_col_stride1]]\n"
1328 "add x19, x19, #4\n"
    // 7: exit. Operand lists follow: read-write pointers, then read-only
    // inputs (all strides pre-multiplied by sizeof(float) so the asm works in
    // bytes), then every register the asm writes plus "cc" and "memory".
1329 "7:\n"
1330 : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
1331 : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
1332 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
1333 );
1334}
Georgios Pinitasbe0ae932018-03-13 13:08:12 +00001335
/** Compute one 2x2 output tile with ReLU for every channel, reading the input through a pointer table.
 *
 * Same computation shape as the strided overload (four accumulators seeded from
 * the first packed value, nine fmla multipliers, fmax against zero before each
 * store), but every input and output location is fetched from a table of
 * pointers instead of being derived from strides.
 *
 * The asm reads inptrs as 8-byte pointers at byte offsets 0..192 (25 entries)
 * and outptrs at byte offsets 0..24 (4 entries). This matches a 5x5 input and
 * a 2x2 output table - presumably Base::inner_tile_rows/cols == 5 and
 * Base::output_tile_rows/cols == 2; confirm against the Base definition.
 * Pointers are re-read from the tables on each use rather than kept in registers.
 *
 * @param[in] n_channels      Number of channels; handled four at a time, then singly.
 * @param[in] weight_bias_ptr Packed parameters; advanced by 160 bytes per four-channel
 *                            group and by 40 bytes per single channel.
 * @param[in] inptrs          Table of input pointers, each indexed by a running channel byte offset.
 * @param[in] outptrs         Table of output pointers, each indexed by a running channel byte offset.
 */
1336template <>
Georgios Pinitasbe0ae932018-03-13 13:08:12 +00001337template <>
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +01001338void Conv::execute_tile<ActivationFunction::ReLU>(
1339 int n_channels,
1340 const void *weight_bias_ptr,
1341 const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
1342 float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
1343)
1344{
1345 __asm __volatile(
    // Setup. x22 = running byte offset applied to every input pointer,
    // x26 = running byte offset applied to every output pointer (both start
    // at 0). x24 = n_channels / 4 (vector iterations), x23 = n_channels % 4.
1346 "mov x22, xzr\n"
1347 "mov x26, xzr\n"
1348 "and x23, %[n_channels], #3\n"
1349 "lsr x24, %[n_channels], #2\n"
1350 "cbz x24, 4f\n"
    // 1: vector prologue. Seed accumulators v3/v2/v1/v0 from the first packed
    // vector (q14), load the nine multiplier vectors, and fetch the first
    // input pointers and values. "subs x24" sets the flags consumed by
    // "beq 3f": with a single group, go straight to the epilogue.
1351 "1:\n"
1352 "ldr q14, [%[wbptr]]\n"
1353 "ldr x19, [%[inptrs], 0]\n"
1354 "mov v3.16b, v14.16b\n"
1355 "ldr q13, [%[wbptr], #16]\n"
1356 "mov v1.16b, v14.16b\n"
1357 "ldr q11, [%[wbptr], #32]\n"
1358 "mov v2.16b, v14.16b\n"
1359 "ldr q4, [%[wbptr], #48]\n"
1360 "mov v0.16b, v14.16b\n"
1361 "ldr q12, [%[wbptr], #64]\n"
1362 "ldr q9, [%[wbptr], #80]\n"
1363 "ldr x20, [%[inptrs], 40]\n"
1364 "ldr q8, [%[wbptr], #96]\n"
1365 "ldr x21, [%[inptrs], 80]\n"
1366 "ldr q7, [%[wbptr], #112]\n"
1367 "ldr x25, [%[inptrs], 120]\n"
1368 "ldr q6, [%[wbptr], #128]\n"
1369 "subs x24, x24, #1\n"
1370 "ldr q5, [%[wbptr], #144]\n"
1371 "ldr q15, [x19, x22]\n"
1372 "fmla v3.4s, v15.4s, v13.4s\n"
1373 "ldr q17, [x20, x22]\n"
1374 "ldr q16, [x21, x22]\n"
1375 "ldr x19, [%[inptrs], 8]\n"
1376 "ldr q15, [x25, x22]\n"
1377 "ldr x20, [%[inptrs], 48]\n"
1378 "ldr q10, [x19, x22]\n"
1379 "ldr x21, [%[inptrs], 88]\n"
1380 "fmla v3.4s, v17.4s, v12.4s\n"
1381 "ldr q17, [x20, x22]\n"
1382 "ldr q14, [x21, x22]\n"
1383 "ldr x19, [%[inptrs], 16]\n"
1384 "ldr q18, [x19, x22]\n"
1385 "fmla v3.4s, v10.4s, v11.4s\n"
1386 "fmla v3.4s, v16.4s, v7.4s\n"
1387 "beq 3f\n"
    // 2: steady-state vector loop. Finishes the current group, clamps each
    // accumulator at zero (fmax against v10 = 0, i.e. ReLU) and stores it via
    // the output pointer table (v3 -> offset 0, v2 -> 8, v1 -> 16, v0 -> 24),
    // while interleaving loads for the next group. x22 and x26 advance by
    // 16 bytes and wbptr by 160 bytes per iteration. Note that v10 doubles as
    // an input temporary and as the zero vector within one iteration.
1388 "2:\n"
1389 "fmla v1.4s, v16.4s, v13.4s\n"
1390 "ldr x20, [%[inptrs], 56]\n"
1391 "fmla v3.4s, v17.4s, v9.4s\n"
1392 "ldr x19, [%[inptrs], 24]\n"
1393 "fmla v2.4s, v18.4s, v13.4s\n"
1394 "ldr q16, [x20, x22]\n"
1395 "movi v10.16b, #0\n"
1396 "ldr q17, [x19, x22]\n"
1397 "fmla v1.4s, v15.4s, v12.4s\n"
1398 "ldr x27, [%[inptrs], 160]\n"
1399 "fmla v3.4s, v18.4s, v4.4s\n"
1400 "ldr x25, [%[inptrs], 128]\n"
1401 "fmla v2.4s, v16.4s, v12.4s\n"
1402 "ldr q18, [x27, x22]\n"
1403 "ldr q15, [x25, x22]\n"
1404 "ldr x21, [%[inptrs], 96]\n"
1405 "fmla v1.4s, v14.4s, v11.4s\n"
1406 "ldr x20, [%[inptrs], 64]\n"
1407 "fmla v3.4s, v14.4s, v6.4s\n"
1408 "ldr q14, [x21, x22]\n"
1409 "fmla v2.4s, v17.4s, v11.4s\n"
1410 "ldr q17, [x20, x22]\n"
1411 "fmla v0.4s, v14.4s, v13.4s\n"
1412 "ldr x19, [%[inptrs], 32]\n"
1413 "fmla v1.4s, v18.4s, v7.4s\n"
1414 "ldr x27, [%[inptrs], 168]\n"
1415 "fmla v3.4s, v16.4s, v8.4s\n"
1416 "ldr q18, [x19, x22]\n"
1417 "fmla v2.4s, v14.4s, v7.4s\n"
1418 "ldr q13, [x27, x22]\n"
1419 "ldr x25, [%[inptrs], 136]\n"
1420 "ldr x21, [%[inptrs], 104]\n"
1421 "ldr x20, [%[inptrs], 72]\n"
1422 "fmla v1.4s, v15.4s, v9.4s\n"
1423 "ldr x27, [%[inptrs], 176]\n"
1424 "fmla v3.4s, v14.4s, v5.4s\n"
1425 "ldr q16, [x25, x22]\n"
1426 "fmla v2.4s, v17.4s, v9.4s\n"
1427 "ldr q17, [x21, x22]\n"
1428 "fmla v0.4s, v16.4s, v12.4s\n"
1429 "ldr q12, [x20, x22]\n"
1430 "fmla v1.4s, v14.4s, v4.4s\n"
1431 "ldr q15, [x27, x22]\n"
1432 "fmax v3.4s, v3.4s, v10.4s\n"
1433 "ldr x25, [%[inptrs], 144]\n"
1434 "fmla v2.4s, v18.4s, v4.4s\n"
1435 "ldr x21, [%[inptrs], 112]\n"
1436 "fmla v0.4s, v17.4s, v11.4s\n"
1437 "ldr q14, [x25, x22]\n"
1438 "fmla v1.4s, v13.4s, v6.4s\n"
1439 "ldr q11, [x21, x22]\n"
1440 "ldr x27, [%[inptrs], 184]\n"
1441 "ldr x25, [%[inptrs], 152]\n"
1442 "ldr x21, [%[outptrs], 0]\n"
1443 "fmla v2.4s, v17.4s, v6.4s\n"
1444 "ldr x28, [%[outptrs], 16]\n"
1445 "str q3, [x21, x26]\n"
1446 "fmla v0.4s, v15.4s, v7.4s\n"
1447 "fmla v1.4s, v16.4s, v8.4s\n"
1448 "ldr q18, [x27, x22]\n"
1449 "ldr q17, [x25, x22]\n"
1450 "ldr x27, [%[inptrs], 192]\n"
1451 "fmla v2.4s, v12.4s, v8.4s\n"
1452 "ldr x21, [%[outptrs], 8]\n"
1453 "fmla v0.4s, v14.4s, v9.4s\n"
1454 "ldr q16, [x27, x22]\n"
1455 "fmla v1.4s, v15.4s, v5.4s\n"
1456 "add %[wbptr], %[wbptr], #160\n"
1457 "ldr q14, [%[wbptr]]\n"
1458 "add x22, x22, #16\n"
1459 "fmla v2.4s, v11.4s, v5.4s\n"
1460 "ldr q13, [%[wbptr], #16]\n"
1461 "fmla v0.4s, v11.4s, v4.4s\n"
1462 "ldr q11, [%[wbptr], #32]\n"
1463 "fmax v1.4s, v1.4s, v10.4s\n"
1464 "ldr q12, [%[wbptr], #64]\n"
1465 "mov v3.16b, v14.16b\n"
1466 "ldr q9, [%[wbptr], #80]\n"
1467 "fmax v2.4s, v2.4s, v10.4s\n"
1468 "ldr q7, [%[wbptr], #112]\n"
1469 "str q1, [x28, x26]\n"
1470 "fmla v0.4s, v18.4s, v6.4s\n"
1471 "mov v1.16b, v14.16b\n"
1472 "ldr q4, [%[wbptr], #48]\n"
1473 "str q2, [x21, x26]\n"
1474 "ldr x28, [%[outptrs], 24]\n"
1475 "mov v2.16b, v14.16b\n"
1476 "prfm pldl1keep, [%[wbptr], #64]\n"
1477 "fmla v0.4s, v17.4s, v8.4s\n"
1478 "ldr q6, [%[wbptr], #128]\n"
1479 "ldr x19, [%[inptrs], 0]\n"
1480 "ldr x20, [%[inptrs], 40]\n"
1481 "ldr x21, [%[inptrs], 80]\n"
1482 "ldr x25, [%[inptrs], 120]\n"
1483 "subs x24, x24, #1\n"
1484 "ldr q15, [x19, x22]\n"
1485 "fmla v0.4s, v16.4s, v5.4s\n"
1486 "ldr q8, [%[wbptr], #96]\n"
1487 "fmla v3.4s, v15.4s, v13.4s\n"
1488 "ldr q17, [x20, x22]\n"
1489 "ldr q16, [x21, x22]\n"
1490 "ldr x19, [%[inptrs], 8]\n"
1491 "ldr q15, [x25, x22]\n"
1492 "ldr x20, [%[inptrs], 48]\n"
1493 "fmax v0.4s, v0.4s, v10.4s\n"
1494 "ldr q5, [%[wbptr], #144]\n"
1495 "fmla v3.4s, v17.4s, v12.4s\n"
1496 "ldr q10, [x19, x22]\n"
1497 "ldr q17, [x20, x22]\n"
1498 "ldr x19, [%[inptrs], 16]\n"
1499 "str q0, [x28, x26]\n"
1500 "ldr x21, [%[inptrs], 88]\n"
1501 "mov v0.16b, v14.16b\n"
1502 "ldr q18, [x19, x22]\n"
1503 "fmla v3.4s, v10.4s, v11.4s\n"
1504 "ldr q14, [x21, x22]\n"
1505 "add x26, x26, #16\n"
1506 "fmla v3.4s, v16.4s, v7.4s\n"
1507 "bne 2b\n"
    // 3: vector epilogue. Completes and stores the last four-channel group
    // without loading anything for a following group; offsets and wbptr are
    // still advanced so the scalar tail continues from the right place.
1508 "3:\n"
1509 "fmla v1.4s, v16.4s, v13.4s\n"
1510 "ldr x20, [%[inptrs], 56]\n"
1511 "fmla v3.4s, v17.4s, v9.4s\n"
1512 "ldr x19, [%[inptrs], 24]\n"
1513 "fmla v2.4s, v18.4s, v13.4s\n"
1514 "ldr q16, [x20, x22]\n"
1515 "movi v10.16b, #0\n"
1516 "ldr q17, [x19, x22]\n"
1517 "fmla v1.4s, v15.4s, v12.4s\n"
1518 "ldr x27, [%[inptrs], 160]\n"
1519 "fmla v3.4s, v18.4s, v4.4s\n"
1520 "ldr x25, [%[inptrs], 128]\n"
1521 "fmla v2.4s, v16.4s, v12.4s\n"
1522 "ldr q18, [x27, x22]\n"
1523 "ldr q15, [x25, x22]\n"
1524 "ldr x21, [%[inptrs], 96]\n"
1525 "fmla v1.4s, v14.4s, v11.4s\n"
1526 "ldr x20, [%[inptrs], 64]\n"
1527 "fmla v3.4s, v14.4s, v6.4s\n"
1528 "ldr q14, [x21, x22]\n"
1529 "fmla v2.4s, v17.4s, v11.4s\n"
1530 "ldr q17, [x20, x22]\n"
1531 "fmla v0.4s, v14.4s, v13.4s\n"
1532 "ldr x19, [%[inptrs], 32]\n"
1533 "fmla v1.4s, v18.4s, v7.4s\n"
1534 "ldr x27, [%[inptrs], 168]\n"
1535 "fmla v3.4s, v16.4s, v8.4s\n"
1536 "ldr q18, [x19, x22]\n"
1537 "fmla v2.4s, v14.4s, v7.4s\n"
1538 "ldr q13, [x27, x22]\n"
1539 "ldr x25, [%[inptrs], 136]\n"
1540 "ldr x21, [%[inptrs], 104]\n"
1541 "ldr x20, [%[inptrs], 72]\n"
1542 "fmla v1.4s, v15.4s, v9.4s\n"
1543 "ldr x27, [%[inptrs], 176]\n"
1544 "fmla v3.4s, v14.4s, v5.4s\n"
1545 "ldr q16, [x25, x22]\n"
1546 "fmla v2.4s, v17.4s, v9.4s\n"
1547 "ldr q17, [x21, x22]\n"
1548 "fmla v0.4s, v16.4s, v12.4s\n"
1549 "ldr q12, [x20, x22]\n"
1550 "fmla v1.4s, v14.4s, v4.4s\n"
1551 "ldr q15, [x27, x22]\n"
1552 "fmax v3.4s, v3.4s, v10.4s\n"
1553 "ldr x25, [%[inptrs], 144]\n"
1554 "fmla v2.4s, v18.4s, v4.4s\n"
1555 "ldr x21, [%[inptrs], 112]\n"
1556 "fmla v0.4s, v17.4s, v11.4s\n"
1557 "ldr q14, [x25, x22]\n"
1558 "fmla v1.4s, v13.4s, v6.4s\n"
1559 "ldr q11, [x21, x22]\n"
1560 "ldr x27, [%[inptrs], 184]\n"
1561 "ldr x25, [%[inptrs], 152]\n"
1562 "ldr x21, [%[outptrs], 0]\n"
1563 "fmla v2.4s, v17.4s, v6.4s\n"
1564 "ldr x28, [%[outptrs], 16]\n"
1565 "str q3, [x21, x26]\n"
1566 "fmla v0.4s, v15.4s, v7.4s\n"
1567 "fmla v1.4s, v16.4s, v8.4s\n"
1568 "ldr q18, [x27, x22]\n"
1569 "ldr q17, [x25, x22]\n"
1570 "ldr x27, [%[inptrs], 192]\n"
1571 "fmla v2.4s, v12.4s, v8.4s\n"
1572 "ldr x21, [%[outptrs], 8]\n"
1573 "fmla v0.4s, v14.4s, v9.4s\n"
1574 "ldr q16, [x27, x22]\n"
1575 "fmla v1.4s, v15.4s, v5.4s\n"
1576 "add %[wbptr], %[wbptr], #160\n"
1577 "prfm pldl1keep, [%[wbptr], #64]\n"
1578 "add x22, x22, #16\n"
1579 "fmla v2.4s, v11.4s, v5.4s\n"
1580 "fmla v0.4s, v11.4s, v4.4s\n"
1581 "fmax v1.4s, v1.4s, v10.4s\n"
1582 "fmax v2.4s, v2.4s, v10.4s\n"
1583 "str q1, [x28, x26]\n"
1584 "fmla v0.4s, v18.4s, v6.4s\n"
1585 "ldr x28, [%[outptrs], 24]\n"
1586 "str q2, [x21, x26]\n"
1587 "fmla v0.4s, v17.4s, v8.4s\n"
1588 "fmla v0.4s, v16.4s, v5.4s\n"
1589 "fmax v0.4s, v0.4s, v10.4s\n"
1590 "str q0, [x28, x26]\n"
1591 "add x26, x26, #16\n"
    // 4: scalar tail prologue for the remaining x23 (= n_channels % 4)
    // channels. Same structure as label 1 with single 32-bit float loads
    // ("ldr s") and packed parameters 4 bytes apart.
1592 "4:\n"
1593 "cbz x23, 7f\n"
1594 "ldr s14, [%[wbptr]]\n"
1595 "mov v3.16b, v14.16b\n"
1596 "ldr s13, [%[wbptr], #4]\n"
1597 "mov v1.16b, v14.16b\n"
1598 "ldr s11, [%[wbptr], #8]\n"
1599 "mov v2.16b, v14.16b\n"
1600 "ldr s4, [%[wbptr], #12]\n"
1601 "mov v0.16b, v14.16b\n"
1602 "ldr s12, [%[wbptr], #16]\n"
1603 "ldr s9, [%[wbptr], #20]\n"
1604 "ldr x19, [%[inptrs], 0]\n"
1605 "ldr s8, [%[wbptr], #24]\n"
1606 "ldr x20, [%[inptrs], 40]\n"
1607 "ldr s7, [%[wbptr], #28]\n"
1608 "ldr x21, [%[inptrs], 80]\n"
1609 "ldr s6, [%[wbptr], #32]\n"
1610 "ldr x25, [%[inptrs], 120]\n"
1611 "ldr s5, [%[wbptr], #36]\n"
1612 "subs x23, x23, #1\n"
1613 "ldr s15, [x19, x22]\n"
1614 "ldr s17, [x20, x22]\n"
1615 "fmla v3.4s, v15.4s, v13.4s\n"
1616 "ldr s16, [x21, x22]\n"
1617 "ldr s15, [x25, x22]\n"
1618 "ldr x19, [%[inptrs], 8]\n"
1619 "ldr x20, [%[inptrs], 48]\n"
1620 "ldr x21, [%[inptrs], 88]\n"
1621 "ldr s10, [x19, x22]\n"
1622 "fmla v3.4s, v17.4s, v12.4s\n"
1623 "ldr s17, [x20, x22]\n"
1624 "ldr s14, [x21, x22]\n"
1625 "ldr x19, [%[inptrs], 16]\n"
1626 "ldr s18, [x19, x22]\n"
1627 "fmla v3.4s, v10.4s, v11.4s\n"
1628 "fmla v3.4s, v16.4s, v7.4s\n"
1629 "beq 6f\n"
    // 5: steady-state scalar loop, one channel per iteration. Mirrors label 2
    // with 32-bit loads/stores; x22 and x26 advance by 4 bytes and wbptr by
    // 40 bytes (10 floats) per channel.
1630 "5:\n"
1631 "fmla v1.4s, v16.4s, v13.4s\n"
1632 "ldr x20, [%[inptrs], 56]\n"
1633 "fmla v3.4s, v17.4s, v9.4s\n"
1634 "ldr x19, [%[inptrs], 24]\n"
1635 "fmla v2.4s, v18.4s, v13.4s\n"
1636 "ldr s16, [x20, x22]\n"
1637 "movi v10.16b, #0\n"
1638 "ldr s17, [x19, x22]\n"
1639 "fmla v1.4s, v15.4s, v12.4s\n"
1640 "ldr x27, [%[inptrs], 160]\n"
1641 "fmla v3.4s, v18.4s, v4.4s\n"
1642 "ldr x25, [%[inptrs], 128]\n"
1643 "fmla v2.4s, v16.4s, v12.4s\n"
1644 "ldr s18, [x27, x22]\n"
1645 "ldr s15, [x25, x22]\n"
1646 "ldr x21, [%[inptrs], 96]\n"
1647 "fmla v1.4s, v14.4s, v11.4s\n"
1648 "ldr x20, [%[inptrs], 64]\n"
1649 "fmla v3.4s, v14.4s, v6.4s\n"
1650 "ldr s14, [x21, x22]\n"
1651 "fmla v2.4s, v17.4s, v11.4s\n"
1652 "ldr s17, [x20, x22]\n"
1653 "fmla v0.4s, v14.4s, v13.4s\n"
1654 "ldr x19, [%[inptrs], 32]\n"
1655 "fmla v1.4s, v18.4s, v7.4s\n"
1656 "ldr x27, [%[inptrs], 168]\n"
1657 "fmla v3.4s, v16.4s, v8.4s\n"
1658 "ldr s18, [x19, x22]\n"
1659 "fmla v2.4s, v14.4s, v7.4s\n"
1660 "ldr s13, [x27, x22]\n"
1661 "ldr x25, [%[inptrs], 136]\n"
1662 "ldr x21, [%[inptrs], 104]\n"
1663 "ldr x20, [%[inptrs], 72]\n"
1664 "fmla v1.4s, v15.4s, v9.4s\n"
1665 "ldr x27, [%[inptrs], 176]\n"
1666 "fmla v3.4s, v14.4s, v5.4s\n"
1667 "ldr s16, [x25, x22]\n"
1668 "fmla v2.4s, v17.4s, v9.4s\n"
1669 "ldr s17, [x21, x22]\n"
1670 "fmla v0.4s, v16.4s, v12.4s\n"
1671 "ldr s12, [x20, x22]\n"
1672 "fmla v1.4s, v14.4s, v4.4s\n"
1673 "ldr s15, [x27, x22]\n"
1674 "fmax v3.4s, v3.4s, v10.4s\n"
1675 "ldr x25, [%[inptrs], 144]\n"
1676 "fmla v2.4s, v18.4s, v4.4s\n"
1677 "ldr x21, [%[inptrs], 112]\n"
1678 "fmla v0.4s, v17.4s, v11.4s\n"
1679 "ldr s14, [x25, x22]\n"
1680 "fmla v1.4s, v13.4s, v6.4s\n"
1681 "ldr s11, [x21, x22]\n"
1682 "ldr x27, [%[inptrs], 184]\n"
1683 "ldr x25, [%[inptrs], 152]\n"
1684 "ldr x21, [%[outptrs], 0]\n"
1685 "fmla v2.4s, v17.4s, v6.4s\n"
1686 "ldr x28, [%[outptrs], 16]\n"
1687 "str s3, [x21, x26]\n"
1688 "fmla v0.4s, v15.4s, v7.4s\n"
1689 "fmla v1.4s, v16.4s, v8.4s\n"
1690 "ldr s18, [x27, x22]\n"
1691 "ldr s17, [x25, x22]\n"
1692 "ldr x27, [%[inptrs], 192]\n"
1693 "fmla v2.4s, v12.4s, v8.4s\n"
1694 "ldr x21, [%[outptrs], 8]\n"
1695 "fmla v0.4s, v14.4s, v9.4s\n"
1696 "ldr s16, [x27, x22]\n"
1697 "fmla v1.4s, v15.4s, v5.4s\n"
1698 "add %[wbptr], %[wbptr], #40\n"
1699 "ldr s14, [%[wbptr]]\n"
1700 "add x22, x22, #4\n"
1701 "fmla v2.4s, v11.4s, v5.4s\n"
1702 "ldr s13, [%[wbptr], #4]\n"
1703 "fmla v0.4s, v11.4s, v4.4s\n"
1704 "ldr s11, [%[wbptr], #8]\n"
1705 "fmax v1.4s, v1.4s, v10.4s\n"
1706 "ldr s12, [%[wbptr], #16]\n"
1707 "mov v3.16b, v14.16b\n"
1708 "ldr s9, [%[wbptr], #20]\n"
1709 "fmax v2.4s, v2.4s, v10.4s\n"
1710 "ldr s7, [%[wbptr], #28]\n"
1711 "str s1, [x28, x26]\n"
1712 "fmla v0.4s, v18.4s, v6.4s\n"
1713 "mov v1.16b, v14.16b\n"
1714 "ldr s4, [%[wbptr], #12]\n"
1715 "str s2, [x21, x26]\n"
1716 "ldr x28, [%[outptrs], 24]\n"
1717 "mov v2.16b, v14.16b\n"
1718 "prfm pldl1keep, [%[wbptr], #64]\n"
1719 "fmla v0.4s, v17.4s, v8.4s\n"
1720 "ldr s6, [%[wbptr], #32]\n"
1721 "ldr x19, [%[inptrs], 0]\n"
1722 "ldr x20, [%[inptrs], 40]\n"
1723 "ldr x21, [%[inptrs], 80]\n"
1724 "ldr x25, [%[inptrs], 120]\n"
1725 "subs x23, x23, #1\n"
1726 "ldr s15, [x19, x22]\n"
1727 "fmla v0.4s, v16.4s, v5.4s\n"
1728 "ldr s8, [%[wbptr], #24]\n"
1729 "fmla v3.4s, v15.4s, v13.4s\n"
1730 "ldr s17, [x20, x22]\n"
1731 "ldr s16, [x21, x22]\n"
1732 "ldr x19, [%[inptrs], 8]\n"
1733 "ldr s15, [x25, x22]\n"
1734 "ldr x20, [%[inptrs], 48]\n"
1735 "fmax v0.4s, v0.4s, v10.4s\n"
1736 "ldr s5, [%[wbptr], #36]\n"
1737 "fmla v3.4s, v17.4s, v12.4s\n"
1738 "ldr s10, [x19, x22]\n"
1739 "ldr s17, [x20, x22]\n"
1740 "ldr x19, [%[inptrs], 16]\n"
1741 "str s0, [x28, x26]\n"
1742 "ldr x21, [%[inptrs], 88]\n"
1743 "mov v0.16b, v14.16b\n"
1744 "ldr s18, [x19, x22]\n"
1745 "fmla v3.4s, v10.4s, v11.4s\n"
1746 "ldr s14, [x21, x22]\n"
1747 "add x26, x26, #4\n"
1748 "fmla v3.4s, v16.4s, v7.4s\n"
1749 "bne 5b\n"
    // 6: scalar epilogue. Completes and stores the final single channel
    // without preloading a following one.
1750 "6:\n"
1751 "fmla v1.4s, v16.4s, v13.4s\n"
1752 "ldr x20, [%[inptrs], 56]\n"
1753 "fmla v3.4s, v17.4s, v9.4s\n"
1754 "ldr x19, [%[inptrs], 24]\n"
1755 "fmla v2.4s, v18.4s, v13.4s\n"
1756 "ldr s16, [x20, x22]\n"
1757 "movi v10.16b, #0\n"
1758 "ldr s17, [x19, x22]\n"
1759 "fmla v1.4s, v15.4s, v12.4s\n"
1760 "ldr x27, [%[inptrs], 160]\n"
1761 "fmla v3.4s, v18.4s, v4.4s\n"
1762 "ldr x25, [%[inptrs], 128]\n"
1763 "fmla v2.4s, v16.4s, v12.4s\n"
1764 "ldr s18, [x27, x22]\n"
1765 "ldr s15, [x25, x22]\n"
1766 "ldr x21, [%[inptrs], 96]\n"
1767 "fmla v1.4s, v14.4s, v11.4s\n"
1768 "ldr x20, [%[inptrs], 64]\n"
1769 "fmla v3.4s, v14.4s, v6.4s\n"
1770 "ldr s14, [x21, x22]\n"
1771 "fmla v2.4s, v17.4s, v11.4s\n"
1772 "ldr s17, [x20, x22]\n"
1773 "fmla v0.4s, v14.4s, v13.4s\n"
1774 "ldr x19, [%[inptrs], 32]\n"
1775 "fmla v1.4s, v18.4s, v7.4s\n"
1776 "ldr x27, [%[inptrs], 168]\n"
1777 "fmla v3.4s, v16.4s, v8.4s\n"
1778 "ldr s18, [x19, x22]\n"
1779 "fmla v2.4s, v14.4s, v7.4s\n"
1780 "ldr s13, [x27, x22]\n"
1781 "ldr x25, [%[inptrs], 136]\n"
1782 "ldr x21, [%[inptrs], 104]\n"
1783 "ldr x20, [%[inptrs], 72]\n"
1784 "fmla v1.4s, v15.4s, v9.4s\n"
1785 "ldr x27, [%[inptrs], 176]\n"
1786 "fmla v3.4s, v14.4s, v5.4s\n"
1787 "ldr s16, [x25, x22]\n"
1788 "fmla v2.4s, v17.4s, v9.4s\n"
1789 "ldr s17, [x21, x22]\n"
1790 "fmla v0.4s, v16.4s, v12.4s\n"
1791 "ldr s12, [x20, x22]\n"
1792 "fmla v1.4s, v14.4s, v4.4s\n"
1793 "ldr s15, [x27, x22]\n"
1794 "fmax v3.4s, v3.4s, v10.4s\n"
1795 "ldr x25, [%[inptrs], 144]\n"
1796 "fmla v2.4s, v18.4s, v4.4s\n"
1797 "ldr x21, [%[inptrs], 112]\n"
1798 "fmla v0.4s, v17.4s, v11.4s\n"
1799 "ldr s14, [x25, x22]\n"
1800 "fmla v1.4s, v13.4s, v6.4s\n"
1801 "ldr s11, [x21, x22]\n"
1802 "ldr x27, [%[inptrs], 184]\n"
1803 "ldr x25, [%[inptrs], 152]\n"
1804 "ldr x21, [%[outptrs], 0]\n"
1805 "fmla v2.4s, v17.4s, v6.4s\n"
1806 "ldr x28, [%[outptrs], 16]\n"
1807 "str s3, [x21, x26]\n"
1808 "fmla v0.4s, v15.4s, v7.4s\n"
1809 "fmla v1.4s, v16.4s, v8.4s\n"
1810 "ldr s18, [x27, x22]\n"
1811 "ldr s17, [x25, x22]\n"
1812 "ldr x27, [%[inptrs], 192]\n"
1813 "fmla v2.4s, v12.4s, v8.4s\n"
1814 "ldr x21, [%[outptrs], 8]\n"
1815 "fmla v0.4s, v14.4s, v9.4s\n"
1816 "ldr s16, [x27, x22]\n"
1817 "fmla v1.4s, v15.4s, v5.4s\n"
1818 "add %[wbptr], %[wbptr], #40\n"
1819 "prfm pldl1keep, [%[wbptr], #64]\n"
1820 "add x22, x22, #4\n"
1821 "fmla v2.4s, v11.4s, v5.4s\n"
1822 "fmla v0.4s, v11.4s, v4.4s\n"
1823 "fmax v1.4s, v1.4s, v10.4s\n"
1824 "fmax v2.4s, v2.4s, v10.4s\n"
1825 "str s1, [x28, x26]\n"
1826 "fmla v0.4s, v18.4s, v6.4s\n"
1827 "ldr x28, [%[outptrs], 24]\n"
1828 "str s2, [x21, x26]\n"
1829 "fmla v0.4s, v17.4s, v8.4s\n"
1830 "fmla v0.4s, v16.4s, v5.4s\n"
1831 "fmax v0.4s, v0.4s, v10.4s\n"
1832 "str s0, [x28, x26]\n"
1833 "add x26, x26, #4\n"
    // 7: exit. Only wbptr is a read-write operand; the pointer tables and
    // channel count are read-only. The clobber list names every register the
    // asm writes plus "cc" and "memory".
1834 "7:\n"
1835 : [wbptr] "+r" (weight_bias_ptr)
1836 : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
1837 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
1838 );
1839}
1840
1841template <>
1842template <>
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00001843void Conv::execute_tile<ActivationFunction::ReLU6>(
1844 int n_channels,
1845 const void *weight_bias_ptr,
1846 const float *input,
1847 const unsigned int input_row_stride,
1848 const unsigned int input_col_stride,
1849 float *output,
1850 const unsigned int output_row_stride,
1851 const unsigned int output_col_stride
1852)
1853{
1854 __asm __volatile(
1855 "add x21, %[inptr0], %[input_row_stride]\n"
1856 "add x23, %[input_col_stride1], %[input_col_stride1]\n"
1857 "add x24, %[outptr0], %[output_row_stride]\n"
1858 "add x28, x21, %[input_row_stride]\n"
1859 "add x26, x23, %[input_col_stride1]\n"
1860 "and x19, %[n_channels], #3\n"
1861 "add x27, x28, %[input_row_stride]\n"
1862 "add x25, x26, %[input_col_stride1]\n"
1863 "lsr x20, %[n_channels], #2\n"
1864 "add x22, x27, %[input_row_stride]\n"
1865 "cbz x20, 4f\n"
1866 "1:\n"
1867 "ldr q14, [%[wbptr]]\n"
1868 "subs x20, x20, #1\n"
1869 "mov v5.16b, v14.16b\n"
1870 "ldr q0, [%[wbptr], #16]\n"
1871 "mov v11.16b, v14.16b\n"
1872 "ldr q1, [%[wbptr], #32]\n"
1873 "mov v12.16b, v14.16b\n"
1874 "ldr q2, [%[wbptr], #48]\n"
1875 "mov v10.16b, v14.16b\n"
1876 "ldr q6, [%[wbptr], #64]\n"
1877 "ldr q3, [%[wbptr], #80]\n"
1878 "ldr q7, [%[wbptr], #96]\n"
1879 "ldr q4, [%[wbptr], #112]\n"
1880 "ldr q8, [%[wbptr], #128]\n"
1881 "ldr q9, [%[wbptr], #144]\n"
1882 "ldr q19, [%[inptr0]]\n"
1883 "fmla v5.4s, v19.4s, v0.4s\n"
1884 "ldr q15, [x21]\n"
1885 "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
1886 "ldr q16, [x28]\n"
1887 "fmla v11.4s, v16.4s, v0.4s\n"
1888 "ldr q23, [x21, %[input_col_stride1]]\n"
1889 "fmla v5.4s, v15.4s, v6.4s\n"
1890 "ldr q18, [%[inptr0], x23]\n"
1891 "ldr q17, [x27]\n"
1892 "ldr q13, [x28, %[input_col_stride1]]\n"
1893 "fmla v5.4s, v21.4s, v1.4s\n"
1894 "fmla v5.4s, v16.4s, v4.4s\n"
1895 "beq 3f\n"
1896 "2:\n"
1897 "fmla v5.4s, v23.4s, v3.4s\n"
1898 "ldr q21, [x21, x23]\n"
1899 "fmla v12.4s, v18.4s, v0.4s\n"
1900 "ldr q20, [%[inptr0], x26]\n"
1901 "fmla v11.4s, v17.4s, v6.4s\n"
1902 "ldr q19, [x22]\n"
1903 "fmla v5.4s, v18.4s, v2.4s\n"
1904 "ldr q15, [x27, %[input_col_stride1]]\n"
1905 "fmla v12.4s, v21.4s, v6.4s\n"
1906 "ldr q16, [x28, x23]\n"
1907 "fmla v11.4s, v13.4s, v1.4s\n"
1908 "ldr q17, [x21, x26]\n"
1909 "fmla v5.4s, v13.4s, v8.4s\n"
1910 "ldr q14, [%[inptr0], x25]\n"
1911 "fmla v12.4s, v20.4s, v1.4s\n"
1912 "ldr q20, [x22, %[input_col_stride1]]\n"
1913 "fmla v11.4s, v19.4s, v4.4s\n"
1914 "ldr q19, [x27, x23]\n"
1915 "fmla v5.4s, v21.4s, v7.4s\n"
1916 "ldr q22, [x28, x26]\n"
1917 "fmla v12.4s, v16.4s, v4.4s\n"
1918 "ldr q21, [x21, x25]\n"
1919 "fmla v11.4s, v15.4s, v3.4s\n"
1920 "ldr q23, [x22, x23]\n"
1921 "fmla v5.4s, v16.4s, v9.4s\n"
1922 "ldr q18, [x27, x26]\n"
1923 "fmla v10.4s, v16.4s, v0.4s\n"
1924 "ldr q15, [x28, x25]\n"
1925 "fmla v11.4s, v16.4s, v2.4s\n"
1926 "ldr q16, [x22, x26]\n"
1927 "fmla v12.4s, v17.4s, v3.4s\n"
1928 "ldr q17, [x27, x25]\n"
1929 "fmla v10.4s, v19.4s, v6.4s\n"
1930 "ldr q13, [x22, x25]\n"
1931 "fmla v11.4s, v20.4s, v8.4s\n"
1932 "add %[wbptr], %[wbptr], #160\n"
1933 "fmla v12.4s, v14.4s, v2.4s\n"
1934 "ldr q14, [%[wbptr]]\n"
1935 "fmla v10.4s, v22.4s, v1.4s\n"
1936 "ldr q0, [%[wbptr], #16]\n"
1937 "fmla v11.4s, v19.4s, v7.4s\n"
1938 "ldr q6, [%[wbptr], #64]\n"
1939 "fmla v12.4s, v22.4s, v8.4s\n"
1940 "prfm pldl1keep, [%[wbptr], #64]\n"
1941 "fmla v10.4s, v23.4s, v4.4s\n"
1942 "ldr q1, [%[wbptr], #32]\n"
1943 "fmla v11.4s, v23.4s, v9.4s\n"
1944 "add %[inptr0], %[inptr0], #16\n"
1945 "fmla v12.4s, v21.4s, v7.4s\n"
1946 "ldr q19, [%[inptr0]]\n"
1947 "fmla v10.4s, v18.4s, v3.4s\n"
1948 "ldr q4, [%[wbptr], #112]\n"
1949 "movi v20.16b, #0\n"
1950 "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
1951 "fmla v12.4s, v15.4s, v9.4s\n"
1952 "ldr q18, [%[inptr0], x23]\n"
1953 "fmla v10.4s, v15.4s, v2.4s\n"
1954 "ldr q3, [%[wbptr], #80]\n"
1955 "fmov v22.4s, #6.0\n"
1956 "add x21, x21, #16\n"
1957 "fmax v5.4s, v5.4s, v20.4s\n"
1958 "ldr q15, [x21]\n"
1959 "fmla v10.4s, v16.4s, v8.4s\n"
1960 "ldr q2, [%[wbptr], #48]\n"
1961 "fmin v5.4s, v5.4s, v22.4s\n"
1962 "ldr q23, [x21, %[input_col_stride1]]\n"
1963 "fmax v12.4s, v12.4s, v20.4s\n"
1964 "add x28, x28, #16\n"
1965 "str q5, [%[outptr0]]\n"
1966 "fmla v10.4s, v17.4s, v7.4s\n"
1967 "fmin v12.4s, v12.4s, v22.4s\n"
1968 "ldr q8, [%[wbptr], #128]\n"
1969 "fmax v11.4s, v11.4s, v20.4s\n"
1970 "ldr q16, [x28]\n"
1971 "str q12, [%[outptr0], %[output_col_stride1]]\n"
1972 "fmla v10.4s, v13.4s, v9.4s\n"
1973 "fmin v11.4s, v11.4s, v22.4s\n"
1974 "ldr q7, [%[wbptr], #96]\n"
1975 "mov v5.16b, v14.16b\n"
1976 "ldr q13, [x28, %[input_col_stride1]]\n"
1977 "str q11, [x24]\n"
1978 "fmax v10.4s, v10.4s, v20.4s\n"
1979 "mov v11.16b, v14.16b\n"
1980 "ldr q9, [%[wbptr], #144]\n"
1981 "fmin v10.4s, v10.4s, v22.4s\n"
1982 "add x27, x27, #16\n"
1983 "mov v12.16b, v14.16b\n"
1984 "ldr q17, [x27]\n"
1985 "str q10, [x24, %[output_col_stride1]]\n"
1986 "fmla v5.4s, v19.4s, v0.4s\n"
1987 "mov v10.16b, v14.16b\n"
1988 "add x22, x22, #16\n"
1989 "fmla v11.4s, v16.4s, v0.4s\n"
1990 "add %[outptr0], %[outptr0], #16\n"
1991 "fmla v5.4s, v15.4s, v6.4s\n"
1992 "add x24, x24, #16\n"
1993 "subs x20, x20, #1\n"
1994 "fmla v5.4s, v21.4s, v1.4s\n"
1995 "fmla v5.4s, v16.4s, v4.4s\n"
1996 "bne 2b\n"
1997 "3:\n"
1998 "fmla v5.4s, v23.4s, v3.4s\n"
1999 "ldr q21, [x21, x23]\n"
2000 "fmla v12.4s, v18.4s, v0.4s\n"
2001 "ldr q20, [%[inptr0], x26]\n"
2002 "fmla v11.4s, v17.4s, v6.4s\n"
2003 "ldr q19, [x22]\n"
2004 "fmla v5.4s, v18.4s, v2.4s\n"
2005 "ldr q15, [x27, %[input_col_stride1]]\n"
2006 "fmla v12.4s, v21.4s, v6.4s\n"
2007 "ldr q16, [x28, x23]\n"
2008 "fmla v11.4s, v13.4s, v1.4s\n"
2009 "ldr q17, [x21, x26]\n"
2010 "fmla v5.4s, v13.4s, v8.4s\n"
2011 "ldr q14, [%[inptr0], x25]\n"
2012 "fmla v12.4s, v20.4s, v1.4s\n"
2013 "ldr q20, [x22, %[input_col_stride1]]\n"
2014 "fmla v11.4s, v19.4s, v4.4s\n"
2015 "ldr q19, [x27, x23]\n"
2016 "fmla v5.4s, v21.4s, v7.4s\n"
2017 "ldr q22, [x28, x26]\n"
2018 "fmla v12.4s, v16.4s, v4.4s\n"
2019 "ldr q21, [x21, x25]\n"
2020 "fmla v11.4s, v15.4s, v3.4s\n"
2021 "ldr q23, [x22, x23]\n"
2022 "fmla v5.4s, v16.4s, v9.4s\n"
2023 "ldr q18, [x27, x26]\n"
2024 "fmla v10.4s, v16.4s, v0.4s\n"
2025 "ldr q15, [x28, x25]\n"
2026 "fmla v11.4s, v16.4s, v2.4s\n"
2027 "ldr q16, [x22, x26]\n"
2028 "fmla v12.4s, v17.4s, v3.4s\n"
2029 "ldr q17, [x27, x25]\n"
2030 "fmla v10.4s, v19.4s, v6.4s\n"
2031 "ldr q13, [x22, x25]\n"
2032 "fmla v11.4s, v20.4s, v8.4s\n"
2033 "add %[wbptr], %[wbptr], #160\n"
2034 "fmla v12.4s, v14.4s, v2.4s\n"
2035 "prfm pldl1keep, [%[wbptr], #64]\n"
2036 "fmla v10.4s, v22.4s, v1.4s\n"
2037 "add %[inptr0], %[inptr0], #16\n"
2038 "fmla v11.4s, v19.4s, v7.4s\n"
2039 "add x21, x21, #16\n"
2040 "fmla v12.4s, v22.4s, v8.4s\n"
2041 "add x28, x28, #16\n"
2042 "fmla v10.4s, v23.4s, v4.4s\n"
2043 "add x27, x27, #16\n"
2044 "fmla v11.4s, v23.4s, v9.4s\n"
2045 "add x22, x22, #16\n"
2046 "fmla v12.4s, v21.4s, v7.4s\n"
2047 "movi v20.16b, #0\n"
2048 "fmla v10.4s, v18.4s, v3.4s\n"
2049 "fmov v22.4s, #6.0\n"
2050 "fmax v5.4s, v5.4s, v20.4s\n"
2051 "fmax v11.4s, v11.4s, v20.4s\n"
2052 "fmla v12.4s, v15.4s, v9.4s\n"
2053 "fmla v10.4s, v15.4s, v2.4s\n"
2054 "fmin v5.4s, v5.4s, v22.4s\n"
2055 "fmin v11.4s, v11.4s, v22.4s\n"
2056 "fmax v12.4s, v12.4s, v20.4s\n"
2057 "str q5, [%[outptr0]]\n"
2058 "str q11, [x24]\n"
2059 "fmla v10.4s, v16.4s, v8.4s\n"
2060 "fmin v12.4s, v12.4s, v22.4s\n"
2061 "str q12, [%[outptr0], %[output_col_stride1]]\n"
2062 "fmla v10.4s, v17.4s, v7.4s\n"
2063 "add %[outptr0], %[outptr0], #16\n"
2064 "fmla v10.4s, v13.4s, v9.4s\n"
2065 "fmax v10.4s, v10.4s, v20.4s\n"
2066 "fmin v10.4s, v10.4s, v22.4s\n"
2067 "str q10, [x24, %[output_col_stride1]]\n"
2068 "add x24, x24, #16\n"
2069 "4:\n"
2070 "cbz x19, 7f\n"
2071 "ldr s14, [%[wbptr]]\n"
2072 "mov v5.16b, v14.16b\n"
2073 "ldr s0, [%[wbptr], #4]\n"
2074 "mov v11.16b, v14.16b\n"
2075 "ldr s1, [%[wbptr], #8]\n"
2076 "mov v12.16b, v14.16b\n"
2077 "ldr s2, [%[wbptr], #12]\n"
2078 "mov v10.16b, v14.16b\n"
2079 "ldr s6, [%[wbptr], #16]\n"
2080 "ldr s3, [%[wbptr], #20]\n"
2081 "subs x19, x19, #1\n"
2082 "ldr s7, [%[wbptr], #24]\n"
2083 "ldr s4, [%[wbptr], #28]\n"
2084 "ldr s8, [%[wbptr], #32]\n"
2085 "ldr s9, [%[wbptr], #36]\n"
2086 "ldr s19, [%[inptr0]]\n"
2087 "ldr s15, [x21]\n"
2088 "fmla v5.4s, v19.4s, v0.4s\n"
2089 "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
2090 "ldr s16, [x28]\n"
2091 "ldr s23, [x21, %[input_col_stride1]]\n"
2092 "fmla v11.4s, v16.4s, v0.4s\n"
2093 "ldr s18, [%[inptr0], x23]\n"
2094 "fmla v5.4s, v15.4s, v6.4s\n"
2095 "ldr s17, [x27]\n"
2096 "ldr s13, [x28, %[input_col_stride1]]\n"
2097 "fmla v5.4s, v21.4s, v1.4s\n"
2098 "fmla v5.4s, v16.4s, v4.4s\n"
2099 "beq 6f\n"
2100 "5:\n"
2101 "fmla v5.4s, v23.4s, v3.4s\n"
2102 "ldr s21, [x21, x23]\n"
2103 "fmla v12.4s, v18.4s, v0.4s\n"
2104 "ldr s20, [%[inptr0], x26]\n"
2105 "fmla v11.4s, v17.4s, v6.4s\n"
2106 "ldr s19, [x22]\n"
2107 "fmla v5.4s, v18.4s, v2.4s\n"
2108 "ldr s15, [x27, %[input_col_stride1]]\n"
2109 "fmla v12.4s, v21.4s, v6.4s\n"
2110 "ldr s16, [x28, x23]\n"
2111 "fmla v11.4s, v13.4s, v1.4s\n"
2112 "ldr s17, [x21, x26]\n"
2113 "fmla v5.4s, v13.4s, v8.4s\n"
2114 "ldr s14, [%[inptr0], x25]\n"
2115 "fmla v12.4s, v20.4s, v1.4s\n"
2116 "ldr s20, [x22, %[input_col_stride1]]\n"
2117 "fmla v11.4s, v19.4s, v4.4s\n"
2118 "ldr s19, [x27, x23]\n"
2119 "fmla v5.4s, v21.4s, v7.4s\n"
2120 "ldr s22, [x28, x26]\n"
2121 "fmla v12.4s, v16.4s, v4.4s\n"
2122 "ldr s21, [x21, x25]\n"
2123 "fmla v11.4s, v15.4s, v3.4s\n"
2124 "ldr s23, [x22, x23]\n"
2125 "fmla v5.4s, v16.4s, v9.4s\n"
2126 "ldr s18, [x27, x26]\n"
2127 "fmla v10.4s, v16.4s, v0.4s\n"
2128 "ldr s15, [x28, x25]\n"
2129 "fmla v11.4s, v16.4s, v2.4s\n"
2130 "ldr s16, [x22, x26]\n"
2131 "fmla v12.4s, v17.4s, v3.4s\n"
2132 "ldr s17, [x27, x25]\n"
2133 "fmla v10.4s, v19.4s, v6.4s\n"
2134 "ldr s13, [x22, x25]\n"
2135 "fmla v11.4s, v20.4s, v8.4s\n"
2136 "add %[wbptr], %[wbptr], #40\n"
2137 "fmla v12.4s, v14.4s, v2.4s\n"
2138 "ldr s14, [%[wbptr]]\n"
2139 "fmla v10.4s, v22.4s, v1.4s\n"
2140 "ldr s0, [%[wbptr], #4]\n"
2141 "fmla v11.4s, v19.4s, v7.4s\n"
2142 "ldr s6, [%[wbptr], #16]\n"
2143 "fmla v12.4s, v22.4s, v8.4s\n"
2144 "prfm pldl1keep, [%[wbptr], #64]\n"
2145 "fmla v10.4s, v23.4s, v4.4s\n"
2146 "ldr s1, [%[wbptr], #8]\n"
2147 "fmla v11.4s, v23.4s, v9.4s\n"
2148 "add %[inptr0], %[inptr0], #4\n"
2149 "fmla v12.4s, v21.4s, v7.4s\n"
2150 "ldr s19, [%[inptr0]]\n"
2151 "fmla v10.4s, v18.4s, v3.4s\n"
2152 "ldr s4, [%[wbptr], #28]\n"
2153 "movi v20.16b, #0\n"
2154 "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
2155 "fmla v12.4s, v15.4s, v9.4s\n"
2156 "ldr s18, [%[inptr0], x23]\n"
2157 "fmla v10.4s, v15.4s, v2.4s\n"
2158 "ldr s3, [%[wbptr], #20]\n"
2159 "fmov v22.4s, #6.0\n"
2160 "add x21, x21, #4\n"
2161 "fmax v5.4s, v5.4s, v20.4s\n"
2162 "ldr s15, [x21]\n"
2163 "fmla v10.4s, v16.4s, v8.4s\n"
2164 "ldr s2, [%[wbptr], #12]\n"
2165 "fmin v5.4s, v5.4s, v22.4s\n"
2166 "ldr s23, [x21, %[input_col_stride1]]\n"
2167 "fmax v12.4s, v12.4s, v20.4s\n"
2168 "add x28, x28, #4\n"
2169 "str s5, [%[outptr0]]\n"
2170 "fmla v10.4s, v17.4s, v7.4s\n"
2171 "fmin v12.4s, v12.4s, v22.4s\n"
2172 "ldr s8, [%[wbptr], #32]\n"
2173 "fmax v11.4s, v11.4s, v20.4s\n"
2174 "ldr s16, [x28]\n"
2175 "str s12, [%[outptr0], %[output_col_stride1]]\n"
2176 "fmla v10.4s, v13.4s, v9.4s\n"
2177 "fmin v11.4s, v11.4s, v22.4s\n"
2178 "ldr s7, [%[wbptr], #24]\n"
2179 "mov v5.16b, v14.16b\n"
2180 "ldr s13, [x28, %[input_col_stride1]]\n"
2181 "str s11, [x24]\n"
2182 "fmax v10.4s, v10.4s, v20.4s\n"
2183 "mov v11.16b, v14.16b\n"
2184 "ldr s9, [%[wbptr], #36]\n"
2185 "fmin v10.4s, v10.4s, v22.4s\n"
2186 "add x27, x27, #4\n"
2187 "mov v12.16b, v14.16b\n"
2188 "ldr s17, [x27]\n"
2189 "str s10, [x24, %[output_col_stride1]]\n"
2190 "fmla v5.4s, v19.4s, v0.4s\n"
2191 "mov v10.16b, v14.16b\n"
2192 "add x22, x22, #4\n"
2193 "fmla v11.4s, v16.4s, v0.4s\n"
2194 "add %[outptr0], %[outptr0], #4\n"
2195 "fmla v5.4s, v15.4s, v6.4s\n"
2196 "add x24, x24, #4\n"
2197 "subs x19, x19, #1\n"
2198 "fmla v5.4s, v21.4s, v1.4s\n"
2199 "fmla v5.4s, v16.4s, v4.4s\n"
2200 "bne 5b\n"
2201 "6:\n"
2202 "fmla v5.4s, v23.4s, v3.4s\n"
2203 "ldr s21, [x21, x23]\n"
2204 "fmla v12.4s, v18.4s, v0.4s\n"
2205 "ldr s20, [%[inptr0], x26]\n"
2206 "fmla v11.4s, v17.4s, v6.4s\n"
2207 "ldr s19, [x22]\n"
2208 "fmla v5.4s, v18.4s, v2.4s\n"
2209 "ldr s15, [x27, %[input_col_stride1]]\n"
2210 "fmla v12.4s, v21.4s, v6.4s\n"
2211 "ldr s16, [x28, x23]\n"
2212 "fmla v11.4s, v13.4s, v1.4s\n"
2213 "ldr s17, [x21, x26]\n"
2214 "fmla v5.4s, v13.4s, v8.4s\n"
2215 "ldr s14, [%[inptr0], x25]\n"
2216 "fmla v12.4s, v20.4s, v1.4s\n"
2217 "ldr s20, [x22, %[input_col_stride1]]\n"
2218 "fmla v11.4s, v19.4s, v4.4s\n"
2219 "ldr s19, [x27, x23]\n"
2220 "fmla v5.4s, v21.4s, v7.4s\n"
2221 "ldr s22, [x28, x26]\n"
2222 "fmla v12.4s, v16.4s, v4.4s\n"
2223 "ldr s21, [x21, x25]\n"
2224 "fmla v11.4s, v15.4s, v3.4s\n"
2225 "ldr s23, [x22, x23]\n"
2226 "fmla v5.4s, v16.4s, v9.4s\n"
2227 "ldr s18, [x27, x26]\n"
2228 "fmla v10.4s, v16.4s, v0.4s\n"
2229 "ldr s15, [x28, x25]\n"
2230 "fmla v11.4s, v16.4s, v2.4s\n"
2231 "ldr s16, [x22, x26]\n"
2232 "fmla v12.4s, v17.4s, v3.4s\n"
2233 "ldr s17, [x27, x25]\n"
2234 "fmla v10.4s, v19.4s, v6.4s\n"
2235 "ldr s13, [x22, x25]\n"
2236 "fmla v11.4s, v20.4s, v8.4s\n"
2237 "add %[wbptr], %[wbptr], #40\n"
2238 "fmla v12.4s, v14.4s, v2.4s\n"
2239 "prfm pldl1keep, [%[wbptr], #64]\n"
2240 "fmla v10.4s, v22.4s, v1.4s\n"
2241 "add %[inptr0], %[inptr0], #4\n"
2242 "fmla v11.4s, v19.4s, v7.4s\n"
2243 "add x21, x21, #4\n"
2244 "fmla v12.4s, v22.4s, v8.4s\n"
2245 "add x28, x28, #4\n"
2246 "fmla v10.4s, v23.4s, v4.4s\n"
2247 "add x27, x27, #4\n"
2248 "fmla v11.4s, v23.4s, v9.4s\n"
2249 "add x22, x22, #4\n"
2250 "fmla v12.4s, v21.4s, v7.4s\n"
2251 "movi v20.16b, #0\n"
2252 "fmla v10.4s, v18.4s, v3.4s\n"
2253 "fmov v22.4s, #6.0\n"
2254 "fmax v5.4s, v5.4s, v20.4s\n"
2255 "fmax v11.4s, v11.4s, v20.4s\n"
2256 "fmla v12.4s, v15.4s, v9.4s\n"
2257 "fmla v10.4s, v15.4s, v2.4s\n"
2258 "fmin v5.4s, v5.4s, v22.4s\n"
2259 "fmin v11.4s, v11.4s, v22.4s\n"
2260 "fmax v12.4s, v12.4s, v20.4s\n"
2261 "str s5, [%[outptr0]]\n"
2262 "str s11, [x24]\n"
2263 "fmla v10.4s, v16.4s, v8.4s\n"
2264 "fmin v12.4s, v12.4s, v22.4s\n"
2265 "str s12, [%[outptr0], %[output_col_stride1]]\n"
2266 "fmla v10.4s, v17.4s, v7.4s\n"
2267 "add %[outptr0], %[outptr0], #4\n"
2268 "fmla v10.4s, v13.4s, v9.4s\n"
2269 "fmax v10.4s, v10.4s, v20.4s\n"
2270 "fmin v10.4s, v10.4s, v22.4s\n"
2271 "str s10, [x24, %[output_col_stride1]]\n"
2272 "add x24, x24, #4\n"
2273 "7:\n"
2274 : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
2275 : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
2276 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
2277 );
2278}
Georgios Pinitasbe0ae932018-03-13 13:08:12 +00002279
/**
 * ReLU6 specialisation of Conv::execute_tile for the pointer-array interface,
 * written entirely as AArch64 inline assembly.
 *
 * For each channel the code seeds four accumulators from the first entry of the
 * current parameter record, adds nine input*multiplier products to each of them,
 * clamps the result to [0, 6] (fmax against 0, fmin against 6.0) and stores one
 * value through each of the four output pointers.
 *
 * Channels are handled four at a time with 128-bit q-register accesses for the
 * n_channels >> 2 full groups, then one at a time with 32-bit s-register accesses
 * for the n_channels & 3 leftover channels.
 * NOTE(review): the group count is formed with a logical shift, so n_channels is
 * presumably expected to be non-negative - confirm against callers.
 *
 * @param n_channels      Number of channels to process.
 * @param weight_bias_ptr Packed parameter stream. As consumed here: one 160-byte
 *                        record (ten 4-lane fp32 vectors) per group of four
 *                        channels, then one 40-byte record (ten fp32 scalars) per
 *                        leftover channel. The first entry of a record initialises
 *                        the accumulators (presumably the bias - confirm against
 *                        the packing routine); the other nine are the multipliers.
 * @param inptrs          Table of input pointers. The assembly reads the 25 entries
 *                        at byte offsets 0, 8, ..., 192 and adds a running channel
 *                        offset to each before loading.
 * @param outptrs         Table of output pointers. The 4 entries at byte offsets
 *                        0, 8, 16 and 24 receive accumulators v8, v2, v3 and v4
 *                        respectively.
 */
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +01002280template <>
2281template <>
2282void Conv::execute_tile<ActivationFunction::ReLU6>(
2283 int n_channels,
2284 const void *weight_bias_ptr,
2285 const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
2286 float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
2287)
2288{
2289 __asm __volatile(
// x27 and x28 are the running byte offsets applied to every input and output pointer.
2290 "mov x27, xzr\n"
2291 "mov x28, xzr\n"
// x26 = n_channels & 3 (leftover channels), x25 = n_channels >> 2 (groups of four).
2292 "and x26, %[n_channels], #3\n"
2293 "lsr x25, %[n_channels], #2\n"
// No complete group of four channels: jump to the single-channel path.
2294 "cbz x25, 4f\n"
// Vector prologue: load the first record (v15 seeds accumulators v8, v3, v2, v4;
// v14, v10, v7, v13, v5, v0, v9, v6, v1 hold the nine multipliers), fetch the first
// inputs and begin accumulating into v8.
2295 "1:\n"
2296 "ldr q15, [%[wbptr]]\n"
2297 "ldr x21, [%[inptrs], 0]\n"
2298 "mov v8.16b, v15.16b\n"
2299 "ldr q14, [%[wbptr], #16]\n"
2300 "mov v3.16b, v15.16b\n"
2301 "ldr q10, [%[wbptr], #32]\n"
2302 "mov v2.16b, v15.16b\n"
2303 "ldr q7, [%[wbptr], #48]\n"
2304 "mov v4.16b, v15.16b\n"
2305 "ldr q13, [%[wbptr], #64]\n"
2306 "ldr q5, [%[wbptr], #80]\n"
2307 "ldr x22, [%[inptrs], 40]\n"
2308 "ldr q0, [%[wbptr], #96]\n"
2309 "ldr x20, [%[inptrs], 80]\n"
2310 "ldr q9, [%[wbptr], #112]\n"
2311 "ldr x23, [%[inptrs], 120]\n"
2312 "ldr q6, [%[wbptr], #128]\n"
// The flags set by this decrement are tested by the beq at the end of the prologue;
// only loads and fmla instructions sit in between.
2313 "subs x25, x25, #1\n"
2314 "ldr q1, [%[wbptr], #144]\n"
2315 "ldr q17, [x21, x27]\n"
2316 "fmla v8.4s, v17.4s, v14.4s\n"
2317 "ldr q18, [x22, x27]\n"
2318 "ldr q16, [x20, x27]\n"
2319 "ldr x21, [%[inptrs], 8]\n"
2320 "ldr q17, [x23, x27]\n"
2321 "ldr x22, [%[inptrs], 48]\n"
2322 "ldr q11, [x21, x27]\n"
2323 "ldr x20, [%[inptrs], 88]\n"
2324 "fmla v8.4s, v18.4s, v13.4s\n"
2325 "ldr q19, [x22, x27]\n"
2326 "ldr q15, [x20, x27]\n"
2327 "ldr x21, [%[inptrs], 16]\n"
2328 "ldr q12, [x21, x27]\n"
2329 "fmla v8.4s, v11.4s, v10.4s\n"
2330 "fmla v8.4s, v16.4s, v9.4s\n"
// Exactly one group: skip the steady-state loop and finish it in the epilogue.
2331 "beq 3f\n"
// Steady-state vector loop: completes the current group of four channels while
// loading the next record and the first inputs of the next group.
2332 "2:\n"
2333 "fmla v3.4s, v16.4s, v14.4s\n"
2334 "ldr x22, [%[inptrs], 56]\n"
2335 "fmla v8.4s, v19.4s, v5.4s\n"
2336 "ldr x21, [%[inptrs], 24]\n"
2337 "fmla v2.4s, v12.4s, v14.4s\n"
2338 "ldr q16, [x22, x27]\n"
// v11 = 0.0 and, a few lines below, v12 = 6.0 are the clamp bounds. They are rebuilt
// on every pass because v11 and v12 are also used as input registers.
2339 "movi v11.16b, #0\n"
2340 "ldr q18, [x21, x27]\n"
2341 "fmla v3.4s, v17.4s, v13.4s\n"
2342 "ldr x20, [%[inptrs], 160]\n"
2343 "fmla v8.4s, v12.4s, v7.4s\n"
2344 "ldr x23, [%[inptrs], 128]\n"
2345 "fmla v2.4s, v16.4s, v13.4s\n"
2346 "ldr q19, [x20, x27]\n"
2347 "fmov v12.4s, #6.0\n"
2348 "ldr q17, [x23, x27]\n"
2349 "fmla v3.4s, v15.4s, v10.4s\n"
2350 "ldr x20, [%[inptrs], 96]\n"
2351 "fmla v8.4s, v15.4s, v6.4s\n"
2352 "ldr x22, [%[inptrs], 64]\n"
2353 "fmla v2.4s, v18.4s, v10.4s\n"
2354 "ldr q15, [x20, x27]\n"
2355 "fmla v4.4s, v15.4s, v14.4s\n"
2356 "ldr q18, [x22, x27]\n"
2357 "fmla v3.4s, v19.4s, v9.4s\n"
2358 "ldr x21, [%[inptrs], 32]\n"
2359 "fmla v8.4s, v16.4s, v0.4s\n"
2360 "ldr x20, [%[inptrs], 168]\n"
2361 "fmla v2.4s, v15.4s, v9.4s\n"
2362 "ldr q19, [x21, x27]\n"
2363 "ldr q16, [x20, x27]\n"
2364 "ldr x23, [%[inptrs], 136]\n"
2365 "fmla v3.4s, v17.4s, v5.4s\n"
2366 "ldr x20, [%[inptrs], 104]\n"
2367 "fmla v8.4s, v15.4s, v1.4s\n"
2368 "ldr q14, [x23, x27]\n"
2369 "fmla v2.4s, v18.4s, v5.4s\n"
2370 "ldr q17, [x20, x27]\n"
2371 "fmla v4.4s, v14.4s, v13.4s\n"
2372 "ldr x22, [%[inptrs], 72]\n"
2373 "fmla v3.4s, v15.4s, v7.4s\n"
2374 "ldr x20, [%[inptrs], 176]\n"
2375 "fmax v8.4s, v8.4s, v11.4s\n"
2376 "ldr q18, [x22, x27]\n"
2377 "fmla v2.4s, v19.4s, v7.4s\n"
2378 "ldr q13, [x20, x27]\n"
2379 "fmla v4.4s, v17.4s, v10.4s\n"
2380 "ldr x23, [%[inptrs], 144]\n"
2381 "fmla v3.4s, v16.4s, v6.4s\n"
2382 "ldr x20, [%[inptrs], 112]\n"
2383 "fmin v8.4s, v8.4s, v12.4s\n"
2384 "ldr q10, [x23, x27]\n"
2385 "fmla v2.4s, v17.4s, v6.4s\n"
2386 "ldr q15, [x20, x27]\n"
2387 "fmla v4.4s, v13.4s, v9.4s\n"
2388 "ldr x20, [%[inptrs], 184]\n"
2389 "fmla v3.4s, v14.4s, v0.4s\n"
2390 "ldr x23, [%[inptrs], 152]\n"
2391 "ldr q9, [x20, x27]\n"
2392 "ldr x22, [%[outptrs], 0]\n"
2393 "fmla v2.4s, v18.4s, v0.4s\n"
2394 "ldr q19, [x23, x27]\n"
2395 "str q8, [x22, x28]\n"
2396 "fmla v4.4s, v10.4s, v5.4s\n"
2397 "fmla v3.4s, v13.4s, v1.4s\n"
2398 "ldr x20, [%[inptrs], 192]\n"
2399 "ldr x22, [%[outptrs], 8]\n"
2400 "ldr x24, [%[outptrs], 16]\n"
// Advance to the next 160-byte record and start reloading the parameter registers.
2401 "add %[wbptr], %[wbptr], #160\n"
2402 "fmla v2.4s, v15.4s, v1.4s\n"
2403 "ldr q16, [x20, x27]\n"
2404 "fmla v4.4s, v15.4s, v7.4s\n"
2405 "ldr q15, [%[wbptr]]\n"
2406 "fmax v3.4s, v3.4s, v11.4s\n"
2407 "ldr q14, [%[wbptr], #16]\n"
2408 "mov v8.16b, v15.16b\n"
2409 "ldr q10, [%[wbptr], #32]\n"
2410 "fmax v2.4s, v2.4s, v11.4s\n"
2411 "ldr q13, [%[wbptr], #64]\n"
2412 "fmla v4.4s, v9.4s, v6.4s\n"
2413 "ldr q7, [%[wbptr], #48]\n"
2414 "fmin v3.4s, v3.4s, v12.4s\n"
2415 "ldr q5, [%[wbptr], #80]\n"
2416 "fmin v2.4s, v2.4s, v12.4s\n"
2417 "ldr q9, [%[wbptr], #112]\n"
2418 "prfm pldl1keep, [%[wbptr], #64]\n"
// Input offset moves on by four floats; loads after this line belong to the next group.
2419 "add x27, x27, #16\n"
2420 "str q3, [x24, x28]\n"
2421 "fmla v4.4s, v19.4s, v0.4s\n"
2422 "str q2, [x22, x28]\n"
2423 "mov v3.16b, v15.16b\n"
2424 "mov v2.16b, v15.16b\n"
2425 "ldr q6, [%[wbptr], #128]\n"
2426 "ldr x24, [%[outptrs], 24]\n"
2427 "ldr x21, [%[inptrs], 0]\n"
2428 "ldr x22, [%[inptrs], 40]\n"
2429 "fmla v4.4s, v16.4s, v1.4s\n"
2430 "ldr q0, [%[wbptr], #96]\n"
2431 "ldr q17, [x21, x27]\n"
2432 "ldr x20, [%[inptrs], 80]\n"
2433 "fmla v8.4s, v17.4s, v14.4s\n"
2434 "ldr q18, [x22, x27]\n"
2435 "ldr q16, [x20, x27]\n"
2436 "ldr x21, [%[inptrs], 8]\n"
2437 "fmax v4.4s, v4.4s, v11.4s\n"
2438 "ldr q1, [%[wbptr], #144]\n"
2439 "ldr q11, [x21, x27]\n"
2440 "ldr x22, [%[inptrs], 48]\n"
2441 "fmla v8.4s, v18.4s, v13.4s\n"
2442 "ldr x21, [%[inptrs], 16]\n"
2443 "fmin v4.4s, v4.4s, v12.4s\n"
2444 "ldr q19, [x22, x27]\n"
2445 "ldr q12, [x21, x27]\n"
2446 "ldr x23, [%[inptrs], 120]\n"
2447 "ldr x20, [%[inptrs], 88]\n"
2448 "subs x25, x25, #1\n"
2449 "str q4, [x24, x28]\n"
2450 "mov v4.16b, v15.16b\n"
2451 "ldr q17, [x23, x27]\n"
2452 "fmla v8.4s, v11.4s, v10.4s\n"
2453 "ldr q15, [x20, x27]\n"
// Output offset moves on only after the last store of the finished group.
2454 "add x28, x28, #16\n"
2455 "fmla v8.4s, v16.4s, v9.4s\n"
2456 "bne 2b\n"
// Vector epilogue: same arithmetic, clamping and stores as the loop body for the
// final group, but no further record or inputs are preloaded.
2457 "3:\n"
2458 "fmla v3.4s, v16.4s, v14.4s\n"
2459 "ldr x22, [%[inptrs], 56]\n"
2460 "fmla v8.4s, v19.4s, v5.4s\n"
2461 "ldr x21, [%[inptrs], 24]\n"
2462 "fmla v2.4s, v12.4s, v14.4s\n"
2463 "ldr q16, [x22, x27]\n"
2464 "movi v11.16b, #0\n"
2465 "ldr q18, [x21, x27]\n"
2466 "fmla v3.4s, v17.4s, v13.4s\n"
2467 "ldr x20, [%[inptrs], 160]\n"
2468 "fmla v8.4s, v12.4s, v7.4s\n"
2469 "ldr x23, [%[inptrs], 128]\n"
2470 "fmla v2.4s, v16.4s, v13.4s\n"
2471 "ldr q19, [x20, x27]\n"
2472 "fmov v12.4s, #6.0\n"
2473 "ldr q17, [x23, x27]\n"
2474 "fmla v3.4s, v15.4s, v10.4s\n"
2475 "ldr x20, [%[inptrs], 96]\n"
2476 "fmla v8.4s, v15.4s, v6.4s\n"
2477 "ldr x22, [%[inptrs], 64]\n"
2478 "fmla v2.4s, v18.4s, v10.4s\n"
2479 "ldr q15, [x20, x27]\n"
2480 "fmla v4.4s, v15.4s, v14.4s\n"
2481 "ldr q18, [x22, x27]\n"
2482 "fmla v3.4s, v19.4s, v9.4s\n"
2483 "ldr x21, [%[inptrs], 32]\n"
2484 "fmla v8.4s, v16.4s, v0.4s\n"
2485 "ldr x20, [%[inptrs], 168]\n"
2486 "fmla v2.4s, v15.4s, v9.4s\n"
2487 "ldr q19, [x21, x27]\n"
2488 "ldr q16, [x20, x27]\n"
2489 "ldr x23, [%[inptrs], 136]\n"
2490 "fmla v3.4s, v17.4s, v5.4s\n"
2491 "ldr x20, [%[inptrs], 104]\n"
2492 "fmla v8.4s, v15.4s, v1.4s\n"
2493 "ldr q14, [x23, x27]\n"
2494 "fmla v2.4s, v18.4s, v5.4s\n"
2495 "ldr q17, [x20, x27]\n"
2496 "fmla v4.4s, v14.4s, v13.4s\n"
2497 "ldr x22, [%[inptrs], 72]\n"
2498 "fmla v3.4s, v15.4s, v7.4s\n"
2499 "ldr x20, [%[inptrs], 176]\n"
2500 "fmax v8.4s, v8.4s, v11.4s\n"
2501 "ldr q18, [x22, x27]\n"
2502 "fmla v2.4s, v19.4s, v7.4s\n"
2503 "ldr q13, [x20, x27]\n"
2504 "fmla v4.4s, v17.4s, v10.4s\n"
2505 "ldr x23, [%[inptrs], 144]\n"
2506 "fmla v3.4s, v16.4s, v6.4s\n"
2507 "ldr x20, [%[inptrs], 112]\n"
2508 "fmin v8.4s, v8.4s, v12.4s\n"
2509 "ldr q10, [x23, x27]\n"
2510 "fmla v2.4s, v17.4s, v6.4s\n"
2511 "ldr q15, [x20, x27]\n"
2512 "fmla v4.4s, v13.4s, v9.4s\n"
2513 "ldr x20, [%[inptrs], 184]\n"
2514 "fmla v3.4s, v14.4s, v0.4s\n"
2515 "ldr x23, [%[inptrs], 152]\n"
2516 "ldr q9, [x20, x27]\n"
2517 "ldr x22, [%[outptrs], 0]\n"
2518 "fmla v2.4s, v18.4s, v0.4s\n"
2519 "ldr q19, [x23, x27]\n"
2520 "str q8, [x22, x28]\n"
2521 "fmla v4.4s, v10.4s, v5.4s\n"
2522 "fmla v3.4s, v13.4s, v1.4s\n"
2523 "ldr x20, [%[inptrs], 192]\n"
2524 "ldr x22, [%[outptrs], 8]\n"
2525 "ldr x24, [%[outptrs], 16]\n"
2526 "add %[wbptr], %[wbptr], #160\n"
2527 "fmla v2.4s, v15.4s, v1.4s\n"
2528 "ldr q16, [x20, x27]\n"
2529 "fmla v4.4s, v15.4s, v7.4s\n"
2530 "prfm pldl1keep, [%[wbptr], #64]\n"
2531 "fmax v3.4s, v3.4s, v11.4s\n"
2532 "add x27, x27, #16\n"
2533 "fmax v2.4s, v2.4s, v11.4s\n"
2534 "fmla v4.4s, v9.4s, v6.4s\n"
2535 "fmin v3.4s, v3.4s, v12.4s\n"
2536 "fmin v2.4s, v2.4s, v12.4s\n"
2537 "str q3, [x24, x28]\n"
2538 "fmla v4.4s, v19.4s, v0.4s\n"
2539 "str q2, [x22, x28]\n"
2540 "ldr x24, [%[outptrs], 24]\n"
2541 "fmla v4.4s, v16.4s, v1.4s\n"
2542 "fmax v4.4s, v4.4s, v11.4s\n"
2543 "fmin v4.4s, v4.4s, v12.4s\n"
2544 "str q4, [x24, x28]\n"
2545 "add x28, x28, #16\n"
// Single-channel path for the n_channels & 3 leftovers. It mirrors the vector path
// with s-register loads/stores, 40-byte parameter records and 4-byte offset steps;
// x27/x28 carry on from where the vector path left them.
2546 "4:\n"
2547 "cbz x26, 7f\n"
2548 "ldr s15, [%[wbptr]]\n"
2549 "mov v8.16b, v15.16b\n"
2550 "ldr s14, [%[wbptr], #4]\n"
2551 "mov v3.16b, v15.16b\n"
2552 "ldr s10, [%[wbptr], #8]\n"
2553 "mov v2.16b, v15.16b\n"
2554 "ldr s7, [%[wbptr], #12]\n"
2555 "mov v4.16b, v15.16b\n"
2556 "ldr s13, [%[wbptr], #16]\n"
2557 "ldr s5, [%[wbptr], #20]\n"
2558 "ldr x21, [%[inptrs], 0]\n"
2559 "ldr s0, [%[wbptr], #24]\n"
2560 "ldr x22, [%[inptrs], 40]\n"
2561 "ldr s9, [%[wbptr], #28]\n"
2562 "ldr x20, [%[inptrs], 80]\n"
2563 "ldr s6, [%[wbptr], #32]\n"
2564 "ldr x23, [%[inptrs], 120]\n"
2565 "ldr s1, [%[wbptr], #36]\n"
2566 "subs x26, x26, #1\n"
2567 "ldr s17, [x21, x27]\n"
2568 "ldr s18, [x22, x27]\n"
2569 "fmla v8.4s, v17.4s, v14.4s\n"
2570 "ldr s16, [x20, x27]\n"
2571 "ldr s17, [x23, x27]\n"
2572 "ldr x21, [%[inptrs], 8]\n"
2573 "ldr x22, [%[inptrs], 48]\n"
2574 "ldr x20, [%[inptrs], 88]\n"
2575 "ldr s11, [x21, x27]\n"
2576 "fmla v8.4s, v18.4s, v13.4s\n"
2577 "ldr s19, [x22, x27]\n"
2578 "ldr s15, [x20, x27]\n"
2579 "ldr x21, [%[inptrs], 16]\n"
2580 "ldr s12, [x21, x27]\n"
2581 "fmla v8.4s, v11.4s, v10.4s\n"
2582 "fmla v8.4s, v16.4s, v9.4s\n"
// Exactly one leftover channel: skip the single-channel loop.
2583 "beq 6f\n"
// Steady-state single-channel loop: finishes one channel while loading the next
// channel's record and first inputs.
2584 "5:\n"
2585 "fmla v3.4s, v16.4s, v14.4s\n"
2586 "ldr x22, [%[inptrs], 56]\n"
2587 "fmla v8.4s, v19.4s, v5.4s\n"
2588 "ldr x21, [%[inptrs], 24]\n"
2589 "fmla v2.4s, v12.4s, v14.4s\n"
2590 "ldr s16, [x22, x27]\n"
2591 "movi v11.16b, #0\n"
2592 "ldr s18, [x21, x27]\n"
2593 "fmla v3.4s, v17.4s, v13.4s\n"
2594 "ldr x20, [%[inptrs], 160]\n"
2595 "fmla v8.4s, v12.4s, v7.4s\n"
2596 "ldr x23, [%[inptrs], 128]\n"
2597 "fmla v2.4s, v16.4s, v13.4s\n"
2598 "ldr s19, [x20, x27]\n"
2599 "fmov v12.4s, #6.0\n"
2600 "ldr s17, [x23, x27]\n"
2601 "fmla v3.4s, v15.4s, v10.4s\n"
2602 "ldr x20, [%[inptrs], 96]\n"
2603 "fmla v8.4s, v15.4s, v6.4s\n"
2604 "ldr x22, [%[inptrs], 64]\n"
2605 "fmla v2.4s, v18.4s, v10.4s\n"
2606 "ldr s15, [x20, x27]\n"
2607 "fmla v4.4s, v15.4s, v14.4s\n"
2608 "ldr s18, [x22, x27]\n"
2609 "fmla v3.4s, v19.4s, v9.4s\n"
2610 "ldr x21, [%[inptrs], 32]\n"
2611 "fmla v8.4s, v16.4s, v0.4s\n"
2612 "ldr x20, [%[inptrs], 168]\n"
2613 "fmla v2.4s, v15.4s, v9.4s\n"
2614 "ldr s19, [x21, x27]\n"
2615 "ldr s16, [x20, x27]\n"
2616 "ldr x23, [%[inptrs], 136]\n"
2617 "fmla v3.4s, v17.4s, v5.4s\n"
2618 "ldr x20, [%[inptrs], 104]\n"
2619 "fmla v8.4s, v15.4s, v1.4s\n"
2620 "ldr s14, [x23, x27]\n"
2621 "fmla v2.4s, v18.4s, v5.4s\n"
2622 "ldr s17, [x20, x27]\n"
2623 "fmla v4.4s, v14.4s, v13.4s\n"
2624 "ldr x22, [%[inptrs], 72]\n"
2625 "fmla v3.4s, v15.4s, v7.4s\n"
2626 "ldr x20, [%[inptrs], 176]\n"
2627 "fmax v8.4s, v8.4s, v11.4s\n"
2628 "ldr s18, [x22, x27]\n"
2629 "fmla v2.4s, v19.4s, v7.4s\n"
2630 "ldr s13, [x20, x27]\n"
2631 "fmla v4.4s, v17.4s, v10.4s\n"
2632 "ldr x23, [%[inptrs], 144]\n"
2633 "fmla v3.4s, v16.4s, v6.4s\n"
2634 "ldr x20, [%[inptrs], 112]\n"
2635 "fmin v8.4s, v8.4s, v12.4s\n"
2636 "ldr s10, [x23, x27]\n"
2637 "fmla v2.4s, v17.4s, v6.4s\n"
2638 "ldr s15, [x20, x27]\n"
2639 "fmla v4.4s, v13.4s, v9.4s\n"
2640 "ldr x20, [%[inptrs], 184]\n"
2641 "fmla v3.4s, v14.4s, v0.4s\n"
2642 "ldr x23, [%[inptrs], 152]\n"
2643 "ldr s9, [x20, x27]\n"
2644 "ldr x22, [%[outptrs], 0]\n"
2645 "fmla v2.4s, v18.4s, v0.4s\n"
2646 "ldr s19, [x23, x27]\n"
2647 "str s8, [x22, x28]\n"
2648 "fmla v4.4s, v10.4s, v5.4s\n"
2649 "fmla v3.4s, v13.4s, v1.4s\n"
2650 "ldr x20, [%[inptrs], 192]\n"
2651 "ldr x22, [%[outptrs], 8]\n"
2652 "ldr x24, [%[outptrs], 16]\n"
2653 "add %[wbptr], %[wbptr], #40\n"
2654 "fmla v2.4s, v15.4s, v1.4s\n"
2655 "ldr s16, [x20, x27]\n"
2656 "fmla v4.4s, v15.4s, v7.4s\n"
2657 "ldr s15, [%[wbptr]]\n"
2658 "fmax v3.4s, v3.4s, v11.4s\n"
2659 "ldr s14, [%[wbptr], #4]\n"
2660 "mov v8.16b, v15.16b\n"
2661 "ldr s10, [%[wbptr], #8]\n"
2662 "fmax v2.4s, v2.4s, v11.4s\n"
2663 "ldr s13, [%[wbptr], #16]\n"
2664 "fmla v4.4s, v9.4s, v6.4s\n"
2665 "ldr s7, [%[wbptr], #12]\n"
2666 "fmin v3.4s, v3.4s, v12.4s\n"
2667 "ldr s5, [%[wbptr], #20]\n"
2668 "fmin v2.4s, v2.4s, v12.4s\n"
2669 "ldr s9, [%[wbptr], #28]\n"
2670 "prfm pldl1keep, [%[wbptr], #64]\n"
2671 "add x27, x27, #4\n"
2672 "str s3, [x24, x28]\n"
2673 "fmla v4.4s, v19.4s, v0.4s\n"
2674 "str s2, [x22, x28]\n"
2675 "mov v3.16b, v15.16b\n"
2676 "mov v2.16b, v15.16b\n"
2677 "ldr s6, [%[wbptr], #32]\n"
2678 "ldr x24, [%[outptrs], 24]\n"
2679 "ldr x21, [%[inptrs], 0]\n"
2680 "ldr x22, [%[inptrs], 40]\n"
2681 "fmla v4.4s, v16.4s, v1.4s\n"
2682 "ldr s0, [%[wbptr], #24]\n"
2683 "ldr s17, [x21, x27]\n"
2684 "ldr x20, [%[inptrs], 80]\n"
2685 "fmla v8.4s, v17.4s, v14.4s\n"
2686 "ldr s18, [x22, x27]\n"
2687 "ldr s16, [x20, x27]\n"
2688 "ldr x21, [%[inptrs], 8]\n"
2689 "fmax v4.4s, v4.4s, v11.4s\n"
2690 "ldr s1, [%[wbptr], #36]\n"
2691 "ldr s11, [x21, x27]\n"
2692 "ldr x22, [%[inptrs], 48]\n"
2693 "fmla v8.4s, v18.4s, v13.4s\n"
2694 "ldr x21, [%[inptrs], 16]\n"
2695 "fmin v4.4s, v4.4s, v12.4s\n"
2696 "ldr s19, [x22, x27]\n"
2697 "ldr s12, [x21, x27]\n"
2698 "ldr x23, [%[inptrs], 120]\n"
2699 "ldr x20, [%[inptrs], 88]\n"
2700 "subs x26, x26, #1\n"
2701 "str s4, [x24, x28]\n"
2702 "mov v4.16b, v15.16b\n"
2703 "ldr s17, [x23, x27]\n"
2704 "fmla v8.4s, v11.4s, v10.4s\n"
2705 "ldr s15, [x20, x27]\n"
2706 "add x28, x28, #4\n"
2707 "fmla v8.4s, v16.4s, v9.4s\n"
2708 "bne 5b\n"
// Single-channel epilogue: completes and stores the last leftover channel without
// preloading anything further.
2709 "6:\n"
2710 "fmla v3.4s, v16.4s, v14.4s\n"
2711 "ldr x22, [%[inptrs], 56]\n"
2712 "fmla v8.4s, v19.4s, v5.4s\n"
2713 "ldr x21, [%[inptrs], 24]\n"
2714 "fmla v2.4s, v12.4s, v14.4s\n"
2715 "ldr s16, [x22, x27]\n"
2716 "movi v11.16b, #0\n"
2717 "ldr s18, [x21, x27]\n"
2718 "fmla v3.4s, v17.4s, v13.4s\n"
2719 "ldr x20, [%[inptrs], 160]\n"
2720 "fmla v8.4s, v12.4s, v7.4s\n"
2721 "ldr x23, [%[inptrs], 128]\n"
2722 "fmla v2.4s, v16.4s, v13.4s\n"
2723 "ldr s19, [x20, x27]\n"
2724 "fmov v12.4s, #6.0\n"
2725 "ldr s17, [x23, x27]\n"
2726 "fmla v3.4s, v15.4s, v10.4s\n"
2727 "ldr x20, [%[inptrs], 96]\n"
2728 "fmla v8.4s, v15.4s, v6.4s\n"
2729 "ldr x22, [%[inptrs], 64]\n"
2730 "fmla v2.4s, v18.4s, v10.4s\n"
2731 "ldr s15, [x20, x27]\n"
2732 "fmla v4.4s, v15.4s, v14.4s\n"
2733 "ldr s18, [x22, x27]\n"
2734 "fmla v3.4s, v19.4s, v9.4s\n"
2735 "ldr x21, [%[inptrs], 32]\n"
2736 "fmla v8.4s, v16.4s, v0.4s\n"
2737 "ldr x20, [%[inptrs], 168]\n"
2738 "fmla v2.4s, v15.4s, v9.4s\n"
2739 "ldr s19, [x21, x27]\n"
2740 "ldr s16, [x20, x27]\n"
2741 "ldr x23, [%[inptrs], 136]\n"
2742 "fmla v3.4s, v17.4s, v5.4s\n"
2743 "ldr x20, [%[inptrs], 104]\n"
2744 "fmla v8.4s, v15.4s, v1.4s\n"
2745 "ldr s14, [x23, x27]\n"
2746 "fmla v2.4s, v18.4s, v5.4s\n"
2747 "ldr s17, [x20, x27]\n"
2748 "fmla v4.4s, v14.4s, v13.4s\n"
2749 "ldr x22, [%[inptrs], 72]\n"
2750 "fmla v3.4s, v15.4s, v7.4s\n"
2751 "ldr x20, [%[inptrs], 176]\n"
2752 "fmax v8.4s, v8.4s, v11.4s\n"
2753 "ldr s18, [x22, x27]\n"
2754 "fmla v2.4s, v19.4s, v7.4s\n"
2755 "ldr s13, [x20, x27]\n"
2756 "fmla v4.4s, v17.4s, v10.4s\n"
2757 "ldr x23, [%[inptrs], 144]\n"
2758 "fmla v3.4s, v16.4s, v6.4s\n"
2759 "ldr x20, [%[inptrs], 112]\n"
2760 "fmin v8.4s, v8.4s, v12.4s\n"
2761 "ldr s10, [x23, x27]\n"
2762 "fmla v2.4s, v17.4s, v6.4s\n"
2763 "ldr s15, [x20, x27]\n"
2764 "fmla v4.4s, v13.4s, v9.4s\n"
2765 "ldr x20, [%[inptrs], 184]\n"
2766 "fmla v3.4s, v14.4s, v0.4s\n"
2767 "ldr x23, [%[inptrs], 152]\n"
2768 "ldr s9, [x20, x27]\n"
2769 "ldr x22, [%[outptrs], 0]\n"
2770 "fmla v2.4s, v18.4s, v0.4s\n"
2771 "ldr s19, [x23, x27]\n"
2772 "str s8, [x22, x28]\n"
2773 "fmla v4.4s, v10.4s, v5.4s\n"
2774 "fmla v3.4s, v13.4s, v1.4s\n"
2775 "ldr x20, [%[inptrs], 192]\n"
2776 "ldr x22, [%[outptrs], 8]\n"
2777 "ldr x24, [%[outptrs], 16]\n"
2778 "add %[wbptr], %[wbptr], #40\n"
2779 "fmla v2.4s, v15.4s, v1.4s\n"
2780 "ldr s16, [x20, x27]\n"
2781 "fmla v4.4s, v15.4s, v7.4s\n"
2782 "prfm pldl1keep, [%[wbptr], #64]\n"
2783 "fmax v3.4s, v3.4s, v11.4s\n"
2784 "add x27, x27, #4\n"
2785 "fmax v2.4s, v2.4s, v11.4s\n"
2786 "fmla v4.4s, v9.4s, v6.4s\n"
2787 "fmin v3.4s, v3.4s, v12.4s\n"
2788 "fmin v2.4s, v2.4s, v12.4s\n"
2789 "str s3, [x24, x28]\n"
2790 "fmla v4.4s, v19.4s, v0.4s\n"
2791 "str s2, [x22, x28]\n"
2792 "ldr x24, [%[outptrs], 24]\n"
2793 "fmla v4.4s, v16.4s, v1.4s\n"
2794 "fmax v4.4s, v4.4s, v11.4s\n"
2795 "fmin v4.4s, v4.4s, v12.4s\n"
2796 "str s4, [x24, x28]\n"
2797 "add x28, x28, #4\n"
// Common exit.
2798 "7:\n"
// wbptr is a read-write operand because the assembly advances it; the pointer
// tables and the channel count are read-only inputs.
2799 : [wbptr] "+r" (weight_bias_ptr)
2800 : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels)
// Clobbers: condition flags, every vector register (v0-v19) and general register
// (x20-x28) named in the assembly, and memory.
2801 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
2802 );
2803}
2804
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00002805#endif // __aarch64__
Georgios Pinitasbe0ae932018-03-13 13:08:12 +00002806
// Explicit instantiation of the class template for the same argument list as the
// `Conv` alias declared at the top of this file. It sits outside the __aarch64__
// guard, so it is compiled for every target.
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00002807template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +00002808
Georgios Pinitas4074c992018-01-30 18:13:46 +00002809} // namespace depthwise