blob: 4ac6276123e686df4cac427831afe15d06db2828 [file] [log] [blame]
Georgios Pinitas4074c992018-01-30 18:13:46 +00001/*
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00002 * Copyright (c) 2018-2019 ARM Limited.
Georgios Pinitas4074c992018-01-30 18:13:46 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas20c246a2018-09-12 16:45:53 +010024#include "impl_fp32_fp32.hpp"
Georgios Pinitas4074c992018-01-30 18:13:46 +000025
26namespace depthwise
27{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000028
29using namespace neon_convolution_kernels;
// Shorthand for the depthwise-convolution configuration specialised in this file.
// NOTE(review): the template arguments presumably read as <output tile rows,
// output tile cols, kernel rows, kernel cols, stride rows, stride cols, ...> -
// that ordering is not visible here, confirm against the class template. It is
// consistent with the assembly kernels in this file, which produce a 2x2 output
// tile from a 5x5 input patch using nine weights per channel.
30using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
31
32#ifdef __aarch64__
// execute_tile<ActivationFunction::None>
//
// AArch64 inline-assembly kernel that produces one 2x2 output tile per channel
// from a 5x5 input patch (five row pointers times five column offsets) and nine
// weights per channel, i.e. a 3x3 kernel applied with a step of two input
// rows/columns between neighbouring outputs. Accumulators are stored exactly as
// computed; this specialisation applies no clamping.
//
// Parameters:
//   n_channels        - number of channels to process. Handled four at a time
//                       with q registers, then one at a time with s registers
//                       for the remaining (n_channels & 3) channels.
//   weight_bias_ptr   - packed parameters. Ten values are consumed per channel:
//                       the first seeds all four accumulators (the bias, going
//                       by the parameter name) and the following nine are the
//                       multiplicands. The vector path reads ten 16-byte vectors
//                       per group of four channels (160 bytes); the remainder
//                       path reads ten floats per channel (40 bytes).
//   input             - first element of the input patch; channels are read
//                       from consecutive floats at each patch position.
//   input_row_stride  - distance between input rows, in floats.
//   input_col_stride  - distance between input columns, in floats.
//   output            - first element of the output tile.
//   output_row_stride - distance between output rows, in floats.
//   output_col_stride - distance between output columns, in floats.
//
// The pointer arguments are passed by value, so the pointer increments performed
// inside the assembly are not visible to the caller.
33template <>
34template <>
35void Conv::execute_tile<ActivationFunction::None>(
36 int n_channels,
37 const void *weight_bias_ptr,
38 const float *input,
39 const unsigned int input_row_stride,
40 const unsigned int input_col_stride,
41 float *output,
42 const unsigned int output_row_stride,
43 const unsigned int output_col_stride
44)
45{
46 __asm __volatile(
 // Address setup (all strides are byte counts, see the operand list):
 //   x23, x24, x25, x26 = input rows 1..4 (row 0 is inptr0)
 //   x19, x20, x21      = 2x, 3x, 4x the input column stride
 //   x22                = output row 1 (row 0 is outptr0)
 //   x28 = n_channels / 4 (vector iterations), x27 = n_channels % 4 (remainder)
47 "add x23, %[inptr0], %[input_row_stride]\n"
48 "add x19, %[input_col_stride1], %[input_col_stride1]\n"
49 "add x22, %[outptr0], %[output_row_stride]\n"
50 "add x24, x23, %[input_row_stride]\n"
51 "add x20, x19, %[input_col_stride1]\n"
52 "and x27, %[n_channels], #3\n"
53 "add x25, x24, %[input_row_stride]\n"
54 "add x21, x20, %[input_col_stride1]\n"
55 "lsr x28, %[n_channels], #2\n"
56 "add x26, x25, %[input_row_stride]\n"
 // Fewer than four channels: skip the vector path entirely.
57 "cbz x28, 4f\n"
 // Vector prologue: load the ten parameter vectors (v14 seeds the four
 // accumulators v12, v10, v11, v9; v8..v0 are the nine weights in row-major
 // order) and start accumulating output (0,0) in v12 and output (1,0) in v10.
 // The flags set by the subs below are consumed by the beq at the end of the
 // prologue; none of the instructions in between write the flags.
58 "1:\n"
59 "ldr q14, [%[wbptr]]\n"
60 "subs x28, x28, #1\n"
61 "mov v12.16b, v14.16b\n"
62 "ldr q8, [%[wbptr], #16]\n"
63 "mov v10.16b, v14.16b\n"
64 "ldr q7, [%[wbptr], #32]\n"
65 "mov v11.16b, v14.16b\n"
66 "ldr q6, [%[wbptr], #48]\n"
67 "mov v9.16b, v14.16b\n"
68 "ldr q5, [%[wbptr], #64]\n"
69 "ldr q4, [%[wbptr], #80]\n"
70 "ldr q3, [%[wbptr], #96]\n"
71 "ldr q2, [%[wbptr], #112]\n"
72 "ldr q1, [%[wbptr], #128]\n"
73 "ldr q0, [%[wbptr], #144]\n"
74 "ldr q15, [%[inptr0]]\n"
75 "fmla v12.4s, v15.4s, v8.4s\n"
76 "ldr q20, [x23]\n"
77 "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
78 "ldr q17, [x24]\n"
79 "fmla v10.4s, v17.4s, v8.4s\n"
80 "ldr q16, [x23, %[input_col_stride1]]\n"
81 "fmla v12.4s, v20.4s, v5.4s\n"
82 "ldr q18, [%[inptr0], x19]\n"
83 "ldr q14, [x25]\n"
84 "ldr q15, [x24, %[input_col_stride1]]\n"
85 "fmla v12.4s, v13.4s, v7.4s\n"
86 "fmla v12.4s, v17.4s, v2.4s\n"
87 "fmla v12.4s, v16.4s, v4.4s\n"
88 "fmla v12.4s, v18.4s, v6.4s\n"
 // Only one group of four channels: go straight to the drain code at 3.
89 "beq 3f\n"
 // Vector steady state: finish and store the four outputs of the current
 // channel group while, interleaved with that work, advancing wbptr by 160
 // bytes, reloading the parameter vectors, stepping every pointer by 16 bytes
 // and starting the accumulators of the next group. Repeats while x28 != 0.
 // Stores: v12 -> (0,0), v11 -> (0,1), v10 -> (1,0), v9 -> (1,1).
90 "2:\n"
91 "fmla v11.4s, v18.4s, v8.4s\n"
92 "ldr q19, [x23, x19]\n"
93 "fmla v10.4s, v14.4s, v5.4s\n"
94 "ldr q20, [%[inptr0], x20]\n"
95 "fmla v12.4s, v15.4s, v1.4s\n"
96 "ldr q14, [x26]\n"
97 "fmla v11.4s, v19.4s, v5.4s\n"
98 "ldr q13, [x25, %[input_col_stride1]]\n"
99 "fmla v10.4s, v15.4s, v7.4s\n"
100 "ldr q17, [x24, x19]\n"
101 "fmla v12.4s, v19.4s, v3.4s\n"
102 "ldr q19, [x23, x20]\n"
103 "fmla v11.4s, v20.4s, v7.4s\n"
104 "ldr q18, [%[inptr0], x21]\n"
105 "fmla v10.4s, v14.4s, v2.4s\n"
106 "ldr q16, [x26, %[input_col_stride1]]\n"
107 "fmla v12.4s, v17.4s, v0.4s\n"
108 "ldr q14, [x25, x19]\n"
109 "fmla v11.4s, v17.4s, v2.4s\n"
110 "ldr q15, [x24, x20]\n"
111 "fmla v10.4s, v13.4s, v4.4s\n"
112 "ldr q13, [x23, x21]\n"
113 "str q12, [%[outptr0]]\n"
114 "fmla v9.4s, v17.4s, v8.4s\n"
115 "fmla v11.4s, v19.4s, v4.4s\n"
116 "ldr q12, [x26, x19]\n"
117 "fmla v10.4s, v17.4s, v6.4s\n"
118 "ldr q20, [x25, x20]\n"
119 "fmla v9.4s, v14.4s, v5.4s\n"
120 "ldr q17, [x24, x21]\n"
121 "fmla v11.4s, v18.4s, v6.4s\n"
122 "ldr q19, [x26, x20]\n"
123 "fmla v10.4s, v16.4s, v1.4s\n"
124 "ldr q18, [x25, x21]\n"
125 "fmla v9.4s, v15.4s, v7.4s\n"
126 "ldr q16, [x26, x21]\n"
127 "fmla v11.4s, v15.4s, v1.4s\n"
128 "add %[wbptr], %[wbptr], #160\n"
129 "fmla v10.4s, v14.4s, v3.4s\n"
130 "ldr q14, [%[wbptr]]\n"
131 "fmla v9.4s, v12.4s, v2.4s\n"
132 "ldr q8, [%[wbptr], #16]\n"
133 "fmla v11.4s, v13.4s, v3.4s\n"
134 "ldr q7, [%[wbptr], #32]\n"
135 "fmla v10.4s, v12.4s, v0.4s\n"
136 "ldr q5, [%[wbptr], #64]\n"
137 "fmla v9.4s, v20.4s, v4.4s\n"
138 "ldr q2, [%[wbptr], #112]\n"
139 "fmla v11.4s, v17.4s, v0.4s\n"
140 "prfm pldl1keep, [%[wbptr], #64]\n"
141 "str q10, [x22]\n"
142 "mov v12.16b, v14.16b\n"
143 "fmla v9.4s, v17.4s, v6.4s\n"
144 "ldr q4, [%[wbptr], #80]\n"
145 "str q11, [%[outptr0], %[output_col_stride1]]\n"
146 "mov v10.16b, v14.16b\n"
147 "mov v11.16b, v14.16b\n"
148 "add %[inptr0], %[inptr0], #16\n"
149 "fmla v9.4s, v19.4s, v1.4s\n"
150 "ldr q6, [%[wbptr], #48]\n"
151 "ldr q15, [%[inptr0]]\n"
152 "add x23, x23, #16\n"
153 "fmla v12.4s, v15.4s, v8.4s\n"
154 "ldr q20, [x23]\n"
155 "fmla v9.4s, v18.4s, v3.4s\n"
156 "ldr q1, [%[wbptr], #128]\n"
157 "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
158 "add x24, x24, #16\n"
159 "fmla v12.4s, v20.4s, v5.4s\n"
160 "ldr q17, [x24]\n"
161 "fmla v9.4s, v16.4s, v0.4s\n"
162 "ldr q3, [%[wbptr], #96]\n"
163 "fmla v10.4s, v17.4s, v8.4s\n"
164 "ldr q16, [x23, %[input_col_stride1]]\n"
165 "fmla v12.4s, v13.4s, v7.4s\n"
166 "ldr q18, [%[inptr0], x19]\n"
167 "str q9, [x22, %[output_col_stride1]]\n"
168 "add x25, x25, #16\n"
169 "mov v9.16b, v14.16b\n"
170 "ldr q0, [%[wbptr], #144]\n"
171 "fmla v12.4s, v17.4s, v2.4s\n"
172 "ldr q14, [x25]\n"
173 "ldr q15, [x24, %[input_col_stride1]]\n"
174 "add x26, x26, #16\n"
175 "add %[outptr0], %[outptr0], #16\n"
176 "add x22, x22, #16\n"
177 "subs x28, x28, #1\n"
178 "fmla v12.4s, v16.4s, v4.4s\n"
179 "fmla v12.4s, v18.4s, v6.4s\n"
180 "bne 2b\n"
 // Vector drain: same arithmetic as the loop body for the last channel group,
 // but without loading parameters or inputs for a following group. Pointers
 // are still advanced so the remainder path continues from the right place.
181 "3:\n"
182 "fmla v11.4s, v18.4s, v8.4s\n"
183 "ldr q19, [x23, x19]\n"
184 "fmla v10.4s, v14.4s, v5.4s\n"
185 "ldr q20, [%[inptr0], x20]\n"
186 "fmla v12.4s, v15.4s, v1.4s\n"
187 "ldr q14, [x26]\n"
188 "fmla v11.4s, v19.4s, v5.4s\n"
189 "ldr q13, [x25, %[input_col_stride1]]\n"
190 "fmla v10.4s, v15.4s, v7.4s\n"
191 "ldr q17, [x24, x19]\n"
192 "fmla v12.4s, v19.4s, v3.4s\n"
193 "ldr q19, [x23, x20]\n"
194 "fmla v11.4s, v20.4s, v7.4s\n"
195 "ldr q18, [%[inptr0], x21]\n"
196 "fmla v10.4s, v14.4s, v2.4s\n"
197 "ldr q16, [x26, %[input_col_stride1]]\n"
198 "fmla v12.4s, v17.4s, v0.4s\n"
199 "ldr q14, [x25, x19]\n"
200 "fmla v11.4s, v17.4s, v2.4s\n"
201 "ldr q15, [x24, x20]\n"
202 "fmla v10.4s, v13.4s, v4.4s\n"
203 "ldr q13, [x23, x21]\n"
204 "str q12, [%[outptr0]]\n"
205 "fmla v9.4s, v17.4s, v8.4s\n"
206 "fmla v11.4s, v19.4s, v4.4s\n"
207 "ldr q12, [x26, x19]\n"
208 "fmla v10.4s, v17.4s, v6.4s\n"
209 "ldr q20, [x25, x20]\n"
210 "fmla v9.4s, v14.4s, v5.4s\n"
211 "ldr q17, [x24, x21]\n"
212 "fmla v11.4s, v18.4s, v6.4s\n"
213 "ldr q19, [x26, x20]\n"
214 "fmla v10.4s, v16.4s, v1.4s\n"
215 "ldr q18, [x25, x21]\n"
216 "fmla v9.4s, v15.4s, v7.4s\n"
217 "ldr q16, [x26, x21]\n"
218 "fmla v11.4s, v15.4s, v1.4s\n"
219 "add %[wbptr], %[wbptr], #160\n"
220 "fmla v10.4s, v14.4s, v3.4s\n"
221 "prfm pldl1keep, [%[wbptr], #64]\n"
222 "fmla v9.4s, v12.4s, v2.4s\n"
223 "add %[inptr0], %[inptr0], #16\n"
224 "fmla v11.4s, v13.4s, v3.4s\n"
225 "add x23, x23, #16\n"
226 "fmla v10.4s, v12.4s, v0.4s\n"
227 "add x24, x24, #16\n"
228 "fmla v9.4s, v20.4s, v4.4s\n"
229 "add x25, x25, #16\n"
230 "fmla v11.4s, v17.4s, v0.4s\n"
231 "add x26, x26, #16\n"
232 "str q10, [x22]\n"
233 "fmla v9.4s, v17.4s, v6.4s\n"
234 "str q11, [%[outptr0], %[output_col_stride1]]\n"
235 "add %[outptr0], %[outptr0], #16\n"
236 "fmla v9.4s, v19.4s, v1.4s\n"
237 "fmla v9.4s, v18.4s, v3.4s\n"
238 "fmla v9.4s, v16.4s, v0.4s\n"
239 "str q9, [x22, %[output_col_stride1]]\n"
240 "add x22, x22, #16\n"
 // Remainder path: the same prologue / loop / drain structure, one channel at
 // a time. Loads and stores use s registers with 4-byte pointer steps and a
 // 40-byte parameter step; the fmla instructions still operate on whole .4s
 // vectors, but only lane 0 is ever written back by the str s stores.
241 "4:\n"
242 "cbz x27, 7f\n"
243 "ldr s14, [%[wbptr]]\n"
244 "mov v12.16b, v14.16b\n"
245 "ldr s8, [%[wbptr], #4]\n"
246 "mov v10.16b, v14.16b\n"
247 "ldr s7, [%[wbptr], #8]\n"
248 "mov v11.16b, v14.16b\n"
249 "ldr s6, [%[wbptr], #12]\n"
250 "mov v9.16b, v14.16b\n"
251 "ldr s5, [%[wbptr], #16]\n"
252 "ldr s4, [%[wbptr], #20]\n"
253 "subs x27, x27, #1\n"
254 "ldr s3, [%[wbptr], #24]\n"
255 "ldr s2, [%[wbptr], #28]\n"
256 "ldr s1, [%[wbptr], #32]\n"
257 "ldr s0, [%[wbptr], #36]\n"
258 "ldr s15, [%[inptr0]]\n"
259 "ldr s20, [x23]\n"
260 "fmla v12.4s, v15.4s, v8.4s\n"
261 "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
262 "ldr s17, [x24]\n"
263 "ldr s16, [x23, %[input_col_stride1]]\n"
264 "fmla v10.4s, v17.4s, v8.4s\n"
265 "ldr s18, [%[inptr0], x19]\n"
266 "fmla v12.4s, v20.4s, v5.4s\n"
267 "ldr s14, [x25]\n"
268 "ldr s15, [x24, %[input_col_stride1]]\n"
269 "fmla v12.4s, v13.4s, v7.4s\n"
270 "fmla v12.4s, v17.4s, v2.4s\n"
271 "fmla v12.4s, v16.4s, v4.4s\n"
272 "fmla v12.4s, v18.4s, v6.4s\n"
 // Single remaining channel: skip the loop and drain immediately.
273 "beq 6f\n"
 // Remainder steady state (mirrors label 2 with scalar loads/stores).
274 "5:\n"
275 "fmla v11.4s, v18.4s, v8.4s\n"
276 "ldr s19, [x23, x19]\n"
277 "fmla v10.4s, v14.4s, v5.4s\n"
278 "ldr s20, [%[inptr0], x20]\n"
279 "fmla v12.4s, v15.4s, v1.4s\n"
280 "ldr s14, [x26]\n"
281 "fmla v11.4s, v19.4s, v5.4s\n"
282 "ldr s13, [x25, %[input_col_stride1]]\n"
283 "fmla v10.4s, v15.4s, v7.4s\n"
284 "ldr s17, [x24, x19]\n"
285 "fmla v12.4s, v19.4s, v3.4s\n"
286 "ldr s19, [x23, x20]\n"
287 "fmla v11.4s, v20.4s, v7.4s\n"
288 "ldr s18, [%[inptr0], x21]\n"
289 "fmla v10.4s, v14.4s, v2.4s\n"
290 "ldr s16, [x26, %[input_col_stride1]]\n"
291 "fmla v12.4s, v17.4s, v0.4s\n"
292 "ldr s14, [x25, x19]\n"
293 "fmla v11.4s, v17.4s, v2.4s\n"
294 "ldr s15, [x24, x20]\n"
295 "fmla v10.4s, v13.4s, v4.4s\n"
296 "ldr s13, [x23, x21]\n"
297 "str s12, [%[outptr0]]\n"
298 "fmla v9.4s, v17.4s, v8.4s\n"
299 "fmla v11.4s, v19.4s, v4.4s\n"
300 "ldr s12, [x26, x19]\n"
301 "fmla v10.4s, v17.4s, v6.4s\n"
302 "ldr s20, [x25, x20]\n"
303 "fmla v9.4s, v14.4s, v5.4s\n"
304 "ldr s17, [x24, x21]\n"
305 "fmla v11.4s, v18.4s, v6.4s\n"
306 "ldr s19, [x26, x20]\n"
307 "fmla v10.4s, v16.4s, v1.4s\n"
308 "ldr s18, [x25, x21]\n"
309 "fmla v9.4s, v15.4s, v7.4s\n"
310 "ldr s16, [x26, x21]\n"
311 "fmla v11.4s, v15.4s, v1.4s\n"
312 "add %[wbptr], %[wbptr], #40\n"
313 "fmla v10.4s, v14.4s, v3.4s\n"
314 "ldr s14, [%[wbptr]]\n"
315 "fmla v9.4s, v12.4s, v2.4s\n"
316 "ldr s8, [%[wbptr], #4]\n"
317 "fmla v11.4s, v13.4s, v3.4s\n"
318 "ldr s7, [%[wbptr], #8]\n"
319 "fmla v10.4s, v12.4s, v0.4s\n"
320 "ldr s5, [%[wbptr], #16]\n"
321 "fmla v9.4s, v20.4s, v4.4s\n"
322 "ldr s2, [%[wbptr], #28]\n"
323 "fmla v11.4s, v17.4s, v0.4s\n"
324 "prfm pldl1keep, [%[wbptr], #64]\n"
325 "str s10, [x22]\n"
326 "mov v12.16b, v14.16b\n"
327 "fmla v9.4s, v17.4s, v6.4s\n"
328 "ldr s4, [%[wbptr], #20]\n"
329 "str s11, [%[outptr0], %[output_col_stride1]]\n"
330 "mov v10.16b, v14.16b\n"
331 "mov v11.16b, v14.16b\n"
332 "add %[inptr0], %[inptr0], #4\n"
333 "fmla v9.4s, v19.4s, v1.4s\n"
334 "ldr s6, [%[wbptr], #12]\n"
335 "ldr s15, [%[inptr0]]\n"
336 "add x23, x23, #4\n"
337 "fmla v12.4s, v15.4s, v8.4s\n"
338 "ldr s20, [x23]\n"
339 "fmla v9.4s, v18.4s, v3.4s\n"
340 "ldr s1, [%[wbptr], #32]\n"
341 "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
342 "add x24, x24, #4\n"
343 "fmla v12.4s, v20.4s, v5.4s\n"
344 "ldr s17, [x24]\n"
345 "fmla v9.4s, v16.4s, v0.4s\n"
346 "ldr s3, [%[wbptr], #24]\n"
347 "fmla v10.4s, v17.4s, v8.4s\n"
348 "ldr s16, [x23, %[input_col_stride1]]\n"
349 "fmla v12.4s, v13.4s, v7.4s\n"
350 "ldr s18, [%[inptr0], x19]\n"
351 "str s9, [x22, %[output_col_stride1]]\n"
352 "add x25, x25, #4\n"
353 "mov v9.16b, v14.16b\n"
354 "ldr s0, [%[wbptr], #36]\n"
355 "fmla v12.4s, v17.4s, v2.4s\n"
356 "ldr s14, [x25]\n"
357 "ldr s15, [x24, %[input_col_stride1]]\n"
358 "add x26, x26, #4\n"
359 "add %[outptr0], %[outptr0], #4\n"
360 "add x22, x22, #4\n"
361 "subs x27, x27, #1\n"
362 "fmla v12.4s, v16.4s, v4.4s\n"
363 "fmla v12.4s, v18.4s, v6.4s\n"
364 "bne 5b\n"
 // Remainder drain (mirrors label 3 with scalar loads/stores).
365 "6:\n"
366 "fmla v11.4s, v18.4s, v8.4s\n"
367 "ldr s19, [x23, x19]\n"
368 "fmla v10.4s, v14.4s, v5.4s\n"
369 "ldr s20, [%[inptr0], x20]\n"
370 "fmla v12.4s, v15.4s, v1.4s\n"
371 "ldr s14, [x26]\n"
372 "fmla v11.4s, v19.4s, v5.4s\n"
373 "ldr s13, [x25, %[input_col_stride1]]\n"
374 "fmla v10.4s, v15.4s, v7.4s\n"
375 "ldr s17, [x24, x19]\n"
376 "fmla v12.4s, v19.4s, v3.4s\n"
377 "ldr s19, [x23, x20]\n"
378 "fmla v11.4s, v20.4s, v7.4s\n"
379 "ldr s18, [%[inptr0], x21]\n"
380 "fmla v10.4s, v14.4s, v2.4s\n"
381 "ldr s16, [x26, %[input_col_stride1]]\n"
382 "fmla v12.4s, v17.4s, v0.4s\n"
383 "ldr s14, [x25, x19]\n"
384 "fmla v11.4s, v17.4s, v2.4s\n"
385 "ldr s15, [x24, x20]\n"
386 "fmla v10.4s, v13.4s, v4.4s\n"
387 "ldr s13, [x23, x21]\n"
388 "str s12, [%[outptr0]]\n"
389 "fmla v9.4s, v17.4s, v8.4s\n"
390 "fmla v11.4s, v19.4s, v4.4s\n"
391 "ldr s12, [x26, x19]\n"
392 "fmla v10.4s, v17.4s, v6.4s\n"
393 "ldr s20, [x25, x20]\n"
394 "fmla v9.4s, v14.4s, v5.4s\n"
395 "ldr s17, [x24, x21]\n"
396 "fmla v11.4s, v18.4s, v6.4s\n"
397 "ldr s19, [x26, x20]\n"
398 "fmla v10.4s, v16.4s, v1.4s\n"
399 "ldr s18, [x25, x21]\n"
400 "fmla v9.4s, v15.4s, v7.4s\n"
401 "ldr s16, [x26, x21]\n"
402 "fmla v11.4s, v15.4s, v1.4s\n"
403 "add %[wbptr], %[wbptr], #40\n"
404 "fmla v10.4s, v14.4s, v3.4s\n"
405 "prfm pldl1keep, [%[wbptr], #64]\n"
406 "fmla v9.4s, v12.4s, v2.4s\n"
407 "add %[inptr0], %[inptr0], #4\n"
408 "fmla v11.4s, v13.4s, v3.4s\n"
409 "add x23, x23, #4\n"
410 "fmla v10.4s, v12.4s, v0.4s\n"
411 "add x24, x24, #4\n"
412 "fmla v9.4s, v20.4s, v4.4s\n"
413 "add x25, x25, #4\n"
414 "fmla v11.4s, v17.4s, v0.4s\n"
415 "add x26, x26, #4\n"
416 "str s10, [x22]\n"
417 "fmla v9.4s, v17.4s, v6.4s\n"
418 "str s11, [%[outptr0], %[output_col_stride1]]\n"
419 "add %[outptr0], %[outptr0], #4\n"
420 "fmla v9.4s, v19.4s, v1.4s\n"
421 "fmla v9.4s, v18.4s, v3.4s\n"
422 "fmla v9.4s, v16.4s, v0.4s\n"
423 "str s9, [x22, %[output_col_stride1]]\n"
424 "add x22, x22, #4\n"
 // Exit label.
425 "7:\n"
 // Operands: the three pointers are read-write because the assembly advances
 // them; the element strides are converted to byte strides here by multiplying
 // with sizeof(float). The clobber list names every scratch register used
 // above (v0-v20, x19-x28) plus the condition flags and memory.
426 : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
427 : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
428 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
429 );
430}
Georgios Pinitas4074c992018-01-30 18:13:46 +0000431
// execute_tile<ActivationFunction::ReLU>
//
// AArch64 inline-assembly kernel that produces one 2x2 output tile per channel
// from a 5x5 input patch (five row pointers times five column offsets) and nine
// weights per channel, i.e. a 3x3 kernel applied with a step of two input
// rows/columns between neighbouring outputs. Before each store the accumulator
// is passed through fmax against an all-zero vector, so negative results are
// written as zero.
//
// Parameters:
//   n_channels        - number of channels to process. Handled four at a time
//                       with q registers, then one at a time with s registers
//                       for the remaining (n_channels & 3) channels.
//   weight_bias_ptr   - packed parameters. Ten values are consumed per channel:
//                       the first seeds all four accumulators (the bias, going
//                       by the parameter name) and the following nine are the
//                       multiplicands. The vector path reads ten 16-byte vectors
//                       per group of four channels (160 bytes); the remainder
//                       path reads ten floats per channel (40 bytes).
//   input             - first element of the input patch; channels are read
//                       from consecutive floats at each patch position.
//   input_row_stride  - distance between input rows, in floats.
//   input_col_stride  - distance between input columns, in floats.
//   output            - first element of the output tile.
//   output_row_stride - distance between output rows, in floats.
//   output_col_stride - distance between output columns, in floats.
//
// The pointer arguments are passed by value, so the pointer increments performed
// inside the assembly are not visible to the caller.
432template <>
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000433template <>
434void Conv::execute_tile<ActivationFunction::ReLU>(
435 int n_channels,
436 const void *weight_bias_ptr,
437 const float *input,
438 const unsigned int input_row_stride,
439 const unsigned int input_col_stride,
440 float *output,
441 const unsigned int output_row_stride,
442 const unsigned int output_col_stride
443)
444{
445 __asm __volatile(
 // Address setup (all strides are byte counts, see the operand list):
 //   x24, x25, x28, x26 = input rows 1..4 (row 0 is inptr0)
 //   x27, x23, x22      = 2x, 3x, 4x the input column stride
 //   x19                = output row 1 (row 0 is outptr0)
 //   x21 = n_channels / 4 (vector iterations), x20 = n_channels % 4 (remainder)
446 "add x24, %[inptr0], %[input_row_stride]\n"
447 "add x27, %[input_col_stride1], %[input_col_stride1]\n"
448 "add x19, %[outptr0], %[output_row_stride]\n"
449 "add x25, x24, %[input_row_stride]\n"
450 "add x23, x27, %[input_col_stride1]\n"
451 "and x20, %[n_channels], #3\n"
452 "add x28, x25, %[input_row_stride]\n"
453 "add x22, x23, %[input_col_stride1]\n"
454 "lsr x21, %[n_channels], #2\n"
455 "add x26, x28, %[input_row_stride]\n"
 // Fewer than four channels: skip the vector path entirely.
456 "cbz x21, 4f\n"
 // Vector prologue: load the ten parameter vectors (v16 seeds the four
 // accumulators v3, v1, v2, v0; the nine weights in row-major order live in
 // v4, v5, v12, v11, v10, v6, v9, v8, v7) and start accumulating output (0,0)
 // in v3 and output (1,0) in v1. The flags set by the subs below are consumed
 // by the beq at the end of the prologue; nothing in between writes the flags.
457 "1:\n"
458 "ldr q16, [%[wbptr]]\n"
459 "subs x21, x21, #1\n"
460 "mov v3.16b, v16.16b\n"
461 "ldr q4, [%[wbptr], #16]\n"
462 "mov v1.16b, v16.16b\n"
463 "ldr q5, [%[wbptr], #32]\n"
464 "mov v2.16b, v16.16b\n"
465 "ldr q12, [%[wbptr], #48]\n"
466 "mov v0.16b, v16.16b\n"
467 "ldr q11, [%[wbptr], #64]\n"
468 "ldr q10, [%[wbptr], #80]\n"
469 "ldr q6, [%[wbptr], #96]\n"
470 "ldr q9, [%[wbptr], #112]\n"
471 "ldr q8, [%[wbptr], #128]\n"
472 "ldr q7, [%[wbptr], #144]\n"
473 "ldr q21, [%[inptr0]]\n"
474 "fmla v3.4s, v21.4s, v4.4s\n"
475 "ldr q23, [x24]\n"
476 "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
477 "ldr q14, [x25]\n"
478 "fmla v1.4s, v14.4s, v4.4s\n"
479 "ldr q13, [x24, %[input_col_stride1]]\n"
480 "fmla v3.4s, v23.4s, v11.4s\n"
481 "ldr q18, [%[inptr0], x27]\n"
482 "ldr q15, [x28]\n"
483 "ldr q22, [x25, %[input_col_stride1]]\n"
484 "fmla v3.4s, v19.4s, v5.4s\n"
485 "fmla v3.4s, v14.4s, v9.4s\n"
 // Only one group of four channels: go straight to the drain code at 3.
486 "beq 3f\n"
 // Vector steady state: finish, clamp (fmax with the zero vector v20) and
 // store the four outputs of the current channel group while, interleaved
 // with that work, advancing wbptr by 160 bytes, reloading the parameter
 // vectors, stepping every pointer by 16 bytes and starting the accumulators
 // of the next group. Repeats while x21 != 0.
 // Stores: v3 -> (0,0), v2 -> (0,1), v1 -> (1,0), v0 -> (1,1).
487 "2:\n"
488 "fmla v3.4s, v13.4s, v10.4s\n"
489 "ldr q17, [x24, x27]\n"
490 "fmla v2.4s, v18.4s, v4.4s\n"
491 "ldr q20, [%[inptr0], x23]\n"
492 "fmla v1.4s, v15.4s, v11.4s\n"
493 "ldr q19, [x26]\n"
494 "fmla v3.4s, v18.4s, v12.4s\n"
495 "ldr q13, [x28, %[input_col_stride1]]\n"
496 "fmla v2.4s, v17.4s, v11.4s\n"
497 "ldr q14, [x25, x27]\n"
498 "fmla v1.4s, v22.4s, v5.4s\n"
499 "ldr q15, [x24, x23]\n"
500 "fmla v3.4s, v22.4s, v8.4s\n"
501 "ldr q16, [%[inptr0], x22]\n"
502 "fmla v2.4s, v20.4s, v5.4s\n"
503 "ldr q20, [x26, %[input_col_stride1]]\n"
504 "fmla v1.4s, v19.4s, v9.4s\n"
505 "ldr q19, [x28, x27]\n"
506 "fmla v3.4s, v17.4s, v6.4s\n"
507 "ldr q21, [x25, x23]\n"
508 "fmla v2.4s, v14.4s, v9.4s\n"
509 "ldr q22, [x24, x22]\n"
510 "fmla v1.4s, v13.4s, v10.4s\n"
511 "ldr q23, [x26, x27]\n"
512 "fmla v3.4s, v14.4s, v7.4s\n"
513 "ldr q18, [x28, x23]\n"
514 "fmla v0.4s, v14.4s, v4.4s\n"
515 "ldr q13, [x25, x22]\n"
516 "fmla v1.4s, v14.4s, v12.4s\n"
517 "ldr q14, [x26, x23]\n"
518 "fmla v2.4s, v15.4s, v10.4s\n"
519 "ldr q17, [x28, x22]\n"
520 "fmla v0.4s, v19.4s, v11.4s\n"
521 "ldr q15, [x26, x22]\n"
522 "fmla v1.4s, v20.4s, v8.4s\n"
523 "add %[wbptr], %[wbptr], #160\n"
524 "fmla v2.4s, v16.4s, v12.4s\n"
525 "ldr q16, [%[wbptr]]\n"
526 "fmla v0.4s, v21.4s, v5.4s\n"
527 "ldr q4, [%[wbptr], #16]\n"
528 "fmla v1.4s, v19.4s, v6.4s\n"
529 "ldr q11, [%[wbptr], #64]\n"
530 "fmla v2.4s, v21.4s, v8.4s\n"
531 "prfm pldl1keep, [%[wbptr], #64]\n"
532 "fmla v0.4s, v23.4s, v9.4s\n"
533 "ldr q5, [%[wbptr], #32]\n"
534 "fmla v1.4s, v23.4s, v7.4s\n"
535 "add %[inptr0], %[inptr0], #16\n"
536 "fmla v2.4s, v22.4s, v6.4s\n"
537 "ldr q21, [%[inptr0]]\n"
538 "fmla v0.4s, v18.4s, v10.4s\n"
539 "ldr q9, [%[wbptr], #112]\n"
 // v20 becomes the all-zero vector used as the lower bound for fmax.
540 "movi v20.16b, #0\n"
541 "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
542 "fmla v2.4s, v13.4s, v7.4s\n"
543 "ldr q18, [%[inptr0], x27]\n"
544 "fmla v0.4s, v13.4s, v12.4s\n"
545 "ldr q10, [%[wbptr], #80]\n"
546 "fmax v3.4s, v3.4s, v20.4s\n"
547 "add x24, x24, #16\n"
548 "fmax v2.4s, v2.4s, v20.4s\n"
549 "ldr q23, [x24]\n"
550 "str q3, [%[outptr0]]\n"
551 "fmla v0.4s, v14.4s, v8.4s\n"
552 "str q2, [%[outptr0], %[output_col_stride1]]\n"
553 "fmax v1.4s, v1.4s, v20.4s\n"
554 "mov v3.16b, v16.16b\n"
555 "ldr q12, [%[wbptr], #48]\n"
556 "str q1, [x19]\n"
557 "fmla v0.4s, v17.4s, v6.4s\n"
558 "mov v1.16b, v16.16b\n"
559 "ldr q8, [%[wbptr], #128]\n"
560 "mov v2.16b, v16.16b\n"
561 "ldr q13, [x24, %[input_col_stride1]]\n"
562 "fmla v0.4s, v15.4s, v7.4s\n"
563 "ldr q6, [%[wbptr], #96]\n"
564 "fmla v3.4s, v21.4s, v4.4s\n"
565 "add x25, x25, #16\n"
566 "ldr q14, [x25]\n"
567 "add x28, x28, #16\n"
568 "fmax v0.4s, v0.4s, v20.4s\n"
569 "ldr q7, [%[wbptr], #144]\n"
570 "fmla v3.4s, v23.4s, v11.4s\n"
571 "ldr q15, [x28]\n"
572 "str q0, [x19, %[output_col_stride1]]\n"
573 "fmla v1.4s, v14.4s, v4.4s\n"
574 "mov v0.16b, v16.16b\n"
575 "ldr q22, [x25, %[input_col_stride1]]\n"
576 "fmla v3.4s, v19.4s, v5.4s\n"
577 "add x26, x26, #16\n"
578 "add %[outptr0], %[outptr0], #16\n"
579 "add x19, x19, #16\n"
580 "subs x21, x21, #1\n"
581 "fmla v3.4s, v14.4s, v9.4s\n"
582 "bne 2b\n"
 // Vector drain: same arithmetic and clamping as the loop body for the last
 // channel group, but without loading parameters or inputs for a following
 // group. Pointers are still advanced so the remainder path continues from
 // the right place.
583 "3:\n"
584 "fmla v3.4s, v13.4s, v10.4s\n"
585 "ldr q17, [x24, x27]\n"
586 "fmla v2.4s, v18.4s, v4.4s\n"
587 "ldr q20, [%[inptr0], x23]\n"
588 "fmla v1.4s, v15.4s, v11.4s\n"
589 "ldr q19, [x26]\n"
590 "fmla v3.4s, v18.4s, v12.4s\n"
591 "ldr q13, [x28, %[input_col_stride1]]\n"
592 "fmla v2.4s, v17.4s, v11.4s\n"
593 "ldr q14, [x25, x27]\n"
594 "fmla v1.4s, v22.4s, v5.4s\n"
595 "ldr q15, [x24, x23]\n"
596 "fmla v3.4s, v22.4s, v8.4s\n"
597 "ldr q16, [%[inptr0], x22]\n"
598 "fmla v2.4s, v20.4s, v5.4s\n"
599 "ldr q20, [x26, %[input_col_stride1]]\n"
600 "fmla v1.4s, v19.4s, v9.4s\n"
601 "ldr q19, [x28, x27]\n"
602 "fmla v3.4s, v17.4s, v6.4s\n"
603 "ldr q21, [x25, x23]\n"
604 "fmla v2.4s, v14.4s, v9.4s\n"
605 "ldr q22, [x24, x22]\n"
606 "fmla v1.4s, v13.4s, v10.4s\n"
607 "ldr q23, [x26, x27]\n"
608 "fmla v3.4s, v14.4s, v7.4s\n"
609 "ldr q18, [x28, x23]\n"
610 "fmla v0.4s, v14.4s, v4.4s\n"
611 "ldr q13, [x25, x22]\n"
612 "fmla v1.4s, v14.4s, v12.4s\n"
613 "ldr q14, [x26, x23]\n"
614 "fmla v2.4s, v15.4s, v10.4s\n"
615 "ldr q17, [x28, x22]\n"
616 "fmla v0.4s, v19.4s, v11.4s\n"
617 "ldr q15, [x26, x22]\n"
618 "fmla v1.4s, v20.4s, v8.4s\n"
619 "add %[wbptr], %[wbptr], #160\n"
620 "fmla v2.4s, v16.4s, v12.4s\n"
621 "prfm pldl1keep, [%[wbptr], #64]\n"
622 "fmla v0.4s, v21.4s, v5.4s\n"
623 "add %[inptr0], %[inptr0], #16\n"
624 "fmla v1.4s, v19.4s, v6.4s\n"
625 "add x24, x24, #16\n"
626 "fmla v2.4s, v21.4s, v8.4s\n"
627 "add x25, x25, #16\n"
628 "fmla v0.4s, v23.4s, v9.4s\n"
629 "add x28, x28, #16\n"
630 "fmla v1.4s, v23.4s, v7.4s\n"
631 "add x26, x26, #16\n"
632 "fmla v2.4s, v22.4s, v6.4s\n"
633 "movi v20.16b, #0\n"
634 "fmla v0.4s, v18.4s, v10.4s\n"
635 "fmax v3.4s, v3.4s, v20.4s\n"
636 "fmla v2.4s, v13.4s, v7.4s\n"
637 "fmax v1.4s, v1.4s, v20.4s\n"
638 "str q3, [%[outptr0]]\n"
639 "fmla v0.4s, v13.4s, v12.4s\n"
640 "str q1, [x19]\n"
641 "fmax v2.4s, v2.4s, v20.4s\n"
642 "fmla v0.4s, v14.4s, v8.4s\n"
643 "str q2, [%[outptr0], %[output_col_stride1]]\n"
644 "add %[outptr0], %[outptr0], #16\n"
645 "fmla v0.4s, v17.4s, v6.4s\n"
646 "fmla v0.4s, v15.4s, v7.4s\n"
647 "fmax v0.4s, v0.4s, v20.4s\n"
648 "str q0, [x19, %[output_col_stride1]]\n"
649 "add x19, x19, #16\n"
 // Remainder path: the same prologue / loop / drain structure, one channel at
 // a time. Loads and stores use s registers with 4-byte pointer steps and a
 // 40-byte parameter step; the fmla/fmax instructions still operate on whole
 // .4s vectors, but only lane 0 is ever written back by the str s stores.
650 "4:\n"
651 "cbz x20, 7f\n"
652 "ldr s16, [%[wbptr]]\n"
653 "mov v3.16b, v16.16b\n"
654 "ldr s4, [%[wbptr], #4]\n"
655 "mov v1.16b, v16.16b\n"
656 "ldr s5, [%[wbptr], #8]\n"
657 "mov v2.16b, v16.16b\n"
658 "ldr s12, [%[wbptr], #12]\n"
659 "mov v0.16b, v16.16b\n"
660 "ldr s11, [%[wbptr], #16]\n"
661 "ldr s10, [%[wbptr], #20]\n"
662 "subs x20, x20, #1\n"
663 "ldr s6, [%[wbptr], #24]\n"
664 "ldr s9, [%[wbptr], #28]\n"
665 "ldr s8, [%[wbptr], #32]\n"
666 "ldr s7, [%[wbptr], #36]\n"
667 "ldr s21, [%[inptr0]]\n"
668 "ldr s23, [x24]\n"
669 "fmla v3.4s, v21.4s, v4.4s\n"
670 "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
671 "ldr s14, [x25]\n"
672 "ldr s13, [x24, %[input_col_stride1]]\n"
673 "fmla v1.4s, v14.4s, v4.4s\n"
674 "ldr s18, [%[inptr0], x27]\n"
675 "fmla v3.4s, v23.4s, v11.4s\n"
676 "ldr s15, [x28]\n"
677 "ldr s22, [x25, %[input_col_stride1]]\n"
678 "fmla v3.4s, v19.4s, v5.4s\n"
679 "fmla v3.4s, v14.4s, v9.4s\n"
 // Single remaining channel: skip the loop and drain immediately.
680 "beq 6f\n"
 // Remainder steady state (mirrors label 2 with scalar loads/stores).
681 "5:\n"
682 "fmla v3.4s, v13.4s, v10.4s\n"
683 "ldr s17, [x24, x27]\n"
684 "fmla v2.4s, v18.4s, v4.4s\n"
685 "ldr s20, [%[inptr0], x23]\n"
686 "fmla v1.4s, v15.4s, v11.4s\n"
687 "ldr s19, [x26]\n"
688 "fmla v3.4s, v18.4s, v12.4s\n"
689 "ldr s13, [x28, %[input_col_stride1]]\n"
690 "fmla v2.4s, v17.4s, v11.4s\n"
691 "ldr s14, [x25, x27]\n"
692 "fmla v1.4s, v22.4s, v5.4s\n"
693 "ldr s15, [x24, x23]\n"
694 "fmla v3.4s, v22.4s, v8.4s\n"
695 "ldr s16, [%[inptr0], x22]\n"
696 "fmla v2.4s, v20.4s, v5.4s\n"
697 "ldr s20, [x26, %[input_col_stride1]]\n"
698 "fmla v1.4s, v19.4s, v9.4s\n"
699 "ldr s19, [x28, x27]\n"
700 "fmla v3.4s, v17.4s, v6.4s\n"
701 "ldr s21, [x25, x23]\n"
702 "fmla v2.4s, v14.4s, v9.4s\n"
703 "ldr s22, [x24, x22]\n"
704 "fmla v1.4s, v13.4s, v10.4s\n"
705 "ldr s23, [x26, x27]\n"
706 "fmla v3.4s, v14.4s, v7.4s\n"
707 "ldr s18, [x28, x23]\n"
708 "fmla v0.4s, v14.4s, v4.4s\n"
709 "ldr s13, [x25, x22]\n"
710 "fmla v1.4s, v14.4s, v12.4s\n"
711 "ldr s14, [x26, x23]\n"
712 "fmla v2.4s, v15.4s, v10.4s\n"
713 "ldr s17, [x28, x22]\n"
714 "fmla v0.4s, v19.4s, v11.4s\n"
715 "ldr s15, [x26, x22]\n"
716 "fmla v1.4s, v20.4s, v8.4s\n"
717 "add %[wbptr], %[wbptr], #40\n"
718 "fmla v2.4s, v16.4s, v12.4s\n"
719 "ldr s16, [%[wbptr]]\n"
720 "fmla v0.4s, v21.4s, v5.4s\n"
721 "ldr s4, [%[wbptr], #4]\n"
722 "fmla v1.4s, v19.4s, v6.4s\n"
723 "ldr s11, [%[wbptr], #16]\n"
724 "fmla v2.4s, v21.4s, v8.4s\n"
725 "prfm pldl1keep, [%[wbptr], #64]\n"
726 "fmla v0.4s, v23.4s, v9.4s\n"
727 "ldr s5, [%[wbptr], #8]\n"
728 "fmla v1.4s, v23.4s, v7.4s\n"
729 "add %[inptr0], %[inptr0], #4\n"
730 "fmla v2.4s, v22.4s, v6.4s\n"
731 "ldr s21, [%[inptr0]]\n"
732 "fmla v0.4s, v18.4s, v10.4s\n"
733 "ldr s9, [%[wbptr], #28]\n"
734 "movi v20.16b, #0\n"
735 "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
736 "fmla v2.4s, v13.4s, v7.4s\n"
737 "ldr s18, [%[inptr0], x27]\n"
738 "fmla v0.4s, v13.4s, v12.4s\n"
739 "ldr s10, [%[wbptr], #20]\n"
740 "fmax v3.4s, v3.4s, v20.4s\n"
741 "add x24, x24, #4\n"
742 "fmax v2.4s, v2.4s, v20.4s\n"
743 "ldr s23, [x24]\n"
744 "str s3, [%[outptr0]]\n"
745 "fmla v0.4s, v14.4s, v8.4s\n"
746 "str s2, [%[outptr0], %[output_col_stride1]]\n"
747 "fmax v1.4s, v1.4s, v20.4s\n"
748 "mov v3.16b, v16.16b\n"
749 "ldr s12, [%[wbptr], #12]\n"
750 "str s1, [x19]\n"
751 "fmla v0.4s, v17.4s, v6.4s\n"
752 "mov v1.16b, v16.16b\n"
753 "ldr s8, [%[wbptr], #32]\n"
754 "mov v2.16b, v16.16b\n"
755 "ldr s13, [x24, %[input_col_stride1]]\n"
756 "fmla v0.4s, v15.4s, v7.4s\n"
757 "ldr s6, [%[wbptr], #24]\n"
758 "fmla v3.4s, v21.4s, v4.4s\n"
759 "add x25, x25, #4\n"
760 "ldr s14, [x25]\n"
761 "add x28, x28, #4\n"
762 "fmax v0.4s, v0.4s, v20.4s\n"
763 "ldr s7, [%[wbptr], #36]\n"
764 "fmla v3.4s, v23.4s, v11.4s\n"
765 "ldr s15, [x28]\n"
766 "str s0, [x19, %[output_col_stride1]]\n"
767 "fmla v1.4s, v14.4s, v4.4s\n"
768 "mov v0.16b, v16.16b\n"
769 "ldr s22, [x25, %[input_col_stride1]]\n"
770 "fmla v3.4s, v19.4s, v5.4s\n"
771 "add x26, x26, #4\n"
772 "add %[outptr0], %[outptr0], #4\n"
773 "add x19, x19, #4\n"
774 "subs x20, x20, #1\n"
775 "fmla v3.4s, v14.4s, v9.4s\n"
776 "bne 5b\n"
 // Remainder drain (mirrors label 3 with scalar loads/stores).
777 "6:\n"
778 "fmla v3.4s, v13.4s, v10.4s\n"
779 "ldr s17, [x24, x27]\n"
780 "fmla v2.4s, v18.4s, v4.4s\n"
781 "ldr s20, [%[inptr0], x23]\n"
782 "fmla v1.4s, v15.4s, v11.4s\n"
783 "ldr s19, [x26]\n"
784 "fmla v3.4s, v18.4s, v12.4s\n"
785 "ldr s13, [x28, %[input_col_stride1]]\n"
786 "fmla v2.4s, v17.4s, v11.4s\n"
787 "ldr s14, [x25, x27]\n"
788 "fmla v1.4s, v22.4s, v5.4s\n"
789 "ldr s15, [x24, x23]\n"
790 "fmla v3.4s, v22.4s, v8.4s\n"
791 "ldr s16, [%[inptr0], x22]\n"
792 "fmla v2.4s, v20.4s, v5.4s\n"
793 "ldr s20, [x26, %[input_col_stride1]]\n"
794 "fmla v1.4s, v19.4s, v9.4s\n"
795 "ldr s19, [x28, x27]\n"
796 "fmla v3.4s, v17.4s, v6.4s\n"
797 "ldr s21, [x25, x23]\n"
798 "fmla v2.4s, v14.4s, v9.4s\n"
799 "ldr s22, [x24, x22]\n"
800 "fmla v1.4s, v13.4s, v10.4s\n"
801 "ldr s23, [x26, x27]\n"
802 "fmla v3.4s, v14.4s, v7.4s\n"
803 "ldr s18, [x28, x23]\n"
804 "fmla v0.4s, v14.4s, v4.4s\n"
805 "ldr s13, [x25, x22]\n"
806 "fmla v1.4s, v14.4s, v12.4s\n"
807 "ldr s14, [x26, x23]\n"
808 "fmla v2.4s, v15.4s, v10.4s\n"
809 "ldr s17, [x28, x22]\n"
810 "fmla v0.4s, v19.4s, v11.4s\n"
811 "ldr s15, [x26, x22]\n"
812 "fmla v1.4s, v20.4s, v8.4s\n"
813 "add %[wbptr], %[wbptr], #40\n"
814 "fmla v2.4s, v16.4s, v12.4s\n"
815 "prfm pldl1keep, [%[wbptr], #64]\n"
816 "fmla v0.4s, v21.4s, v5.4s\n"
817 "add %[inptr0], %[inptr0], #4\n"
818 "fmla v1.4s, v19.4s, v6.4s\n"
819 "add x24, x24, #4\n"
820 "fmla v2.4s, v21.4s, v8.4s\n"
821 "add x25, x25, #4\n"
822 "fmla v0.4s, v23.4s, v9.4s\n"
823 "add x28, x28, #4\n"
824 "fmla v1.4s, v23.4s, v7.4s\n"
825 "add x26, x26, #4\n"
826 "fmla v2.4s, v22.4s, v6.4s\n"
827 "movi v20.16b, #0\n"
828 "fmla v0.4s, v18.4s, v10.4s\n"
829 "fmax v3.4s, v3.4s, v20.4s\n"
830 "fmla v2.4s, v13.4s, v7.4s\n"
831 "fmax v1.4s, v1.4s, v20.4s\n"
832 "str s3, [%[outptr0]]\n"
833 "fmla v0.4s, v13.4s, v12.4s\n"
834 "str s1, [x19]\n"
835 "fmax v2.4s, v2.4s, v20.4s\n"
836 "fmla v0.4s, v14.4s, v8.4s\n"
837 "str s2, [%[outptr0], %[output_col_stride1]]\n"
838 "add %[outptr0], %[outptr0], #4\n"
839 "fmla v0.4s, v17.4s, v6.4s\n"
840 "fmla v0.4s, v15.4s, v7.4s\n"
841 "fmax v0.4s, v0.4s, v20.4s\n"
842 "str s0, [x19, %[output_col_stride1]]\n"
843 "add x19, x19, #4\n"
 // Exit label.
844 "7:\n"
 // Operands: the three pointers are read-write because the assembly advances
 // them; the element strides are converted to byte strides here by multiplying
 // with sizeof(float). The clobber list names every scratch register used
 // above (v0-v23, x19-x28) plus the condition flags and memory.
845 : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
846 : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
847 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
848 );
849}
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000850
851template <>
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000852template <>
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000853void Conv::execute_tile<ActivationFunction::ReLU6>(
854 int n_channels,
855 const void *weight_bias_ptr,
856 const float *input,
857 const unsigned int input_row_stride,
858 const unsigned int input_col_stride,
859 float *output,
860 const unsigned int output_row_stride,
861 const unsigned int output_col_stride
862)
863{
864 __asm __volatile(
865 "add x21, %[inptr0], %[input_row_stride]\n"
866 "add x23, %[input_col_stride1], %[input_col_stride1]\n"
867 "add x24, %[outptr0], %[output_row_stride]\n"
868 "add x28, x21, %[input_row_stride]\n"
869 "add x26, x23, %[input_col_stride1]\n"
870 "and x19, %[n_channels], #3\n"
871 "add x27, x28, %[input_row_stride]\n"
872 "add x25, x26, %[input_col_stride1]\n"
873 "lsr x20, %[n_channels], #2\n"
874 "add x22, x27, %[input_row_stride]\n"
875 "cbz x20, 4f\n"
876 "1:\n"
877 "ldr q14, [%[wbptr]]\n"
878 "subs x20, x20, #1\n"
879 "mov v5.16b, v14.16b\n"
880 "ldr q0, [%[wbptr], #16]\n"
881 "mov v11.16b, v14.16b\n"
882 "ldr q1, [%[wbptr], #32]\n"
883 "mov v12.16b, v14.16b\n"
884 "ldr q2, [%[wbptr], #48]\n"
885 "mov v10.16b, v14.16b\n"
886 "ldr q6, [%[wbptr], #64]\n"
887 "ldr q3, [%[wbptr], #80]\n"
888 "ldr q7, [%[wbptr], #96]\n"
889 "ldr q4, [%[wbptr], #112]\n"
890 "ldr q8, [%[wbptr], #128]\n"
891 "ldr q9, [%[wbptr], #144]\n"
892 "ldr q19, [%[inptr0]]\n"
893 "fmla v5.4s, v19.4s, v0.4s\n"
894 "ldr q15, [x21]\n"
895 "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
896 "ldr q16, [x28]\n"
897 "fmla v11.4s, v16.4s, v0.4s\n"
898 "ldr q23, [x21, %[input_col_stride1]]\n"
899 "fmla v5.4s, v15.4s, v6.4s\n"
900 "ldr q18, [%[inptr0], x23]\n"
901 "ldr q17, [x27]\n"
902 "ldr q13, [x28, %[input_col_stride1]]\n"
903 "fmla v5.4s, v21.4s, v1.4s\n"
904 "fmla v5.4s, v16.4s, v4.4s\n"
905 "beq 3f\n"
906 "2:\n"
907 "fmla v5.4s, v23.4s, v3.4s\n"
908 "ldr q21, [x21, x23]\n"
909 "fmla v12.4s, v18.4s, v0.4s\n"
910 "ldr q20, [%[inptr0], x26]\n"
911 "fmla v11.4s, v17.4s, v6.4s\n"
912 "ldr q19, [x22]\n"
913 "fmla v5.4s, v18.4s, v2.4s\n"
914 "ldr q15, [x27, %[input_col_stride1]]\n"
915 "fmla v12.4s, v21.4s, v6.4s\n"
916 "ldr q16, [x28, x23]\n"
917 "fmla v11.4s, v13.4s, v1.4s\n"
918 "ldr q17, [x21, x26]\n"
919 "fmla v5.4s, v13.4s, v8.4s\n"
920 "ldr q14, [%[inptr0], x25]\n"
921 "fmla v12.4s, v20.4s, v1.4s\n"
922 "ldr q20, [x22, %[input_col_stride1]]\n"
923 "fmla v11.4s, v19.4s, v4.4s\n"
924 "ldr q19, [x27, x23]\n"
925 "fmla v5.4s, v21.4s, v7.4s\n"
926 "ldr q22, [x28, x26]\n"
927 "fmla v12.4s, v16.4s, v4.4s\n"
928 "ldr q21, [x21, x25]\n"
929 "fmla v11.4s, v15.4s, v3.4s\n"
930 "ldr q23, [x22, x23]\n"
931 "fmla v5.4s, v16.4s, v9.4s\n"
932 "ldr q18, [x27, x26]\n"
933 "fmla v10.4s, v16.4s, v0.4s\n"
934 "ldr q15, [x28, x25]\n"
935 "fmla v11.4s, v16.4s, v2.4s\n"
936 "ldr q16, [x22, x26]\n"
937 "fmla v12.4s, v17.4s, v3.4s\n"
938 "ldr q17, [x27, x25]\n"
939 "fmla v10.4s, v19.4s, v6.4s\n"
940 "ldr q13, [x22, x25]\n"
941 "fmla v11.4s, v20.4s, v8.4s\n"
942 "add %[wbptr], %[wbptr], #160\n"
943 "fmla v12.4s, v14.4s, v2.4s\n"
944 "ldr q14, [%[wbptr]]\n"
945 "fmla v10.4s, v22.4s, v1.4s\n"
946 "ldr q0, [%[wbptr], #16]\n"
947 "fmla v11.4s, v19.4s, v7.4s\n"
948 "ldr q6, [%[wbptr], #64]\n"
949 "fmla v12.4s, v22.4s, v8.4s\n"
950 "prfm pldl1keep, [%[wbptr], #64]\n"
951 "fmla v10.4s, v23.4s, v4.4s\n"
952 "ldr q1, [%[wbptr], #32]\n"
953 "fmla v11.4s, v23.4s, v9.4s\n"
954 "add %[inptr0], %[inptr0], #16\n"
955 "fmla v12.4s, v21.4s, v7.4s\n"
956 "ldr q19, [%[inptr0]]\n"
957 "fmla v10.4s, v18.4s, v3.4s\n"
958 "ldr q4, [%[wbptr], #112]\n"
959 "movi v20.16b, #0\n"
960 "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
961 "fmla v12.4s, v15.4s, v9.4s\n"
962 "ldr q18, [%[inptr0], x23]\n"
963 "fmla v10.4s, v15.4s, v2.4s\n"
964 "ldr q3, [%[wbptr], #80]\n"
965 "fmov v22.4s, #6.0\n"
966 "add x21, x21, #16\n"
967 "fmax v5.4s, v5.4s, v20.4s\n"
968 "ldr q15, [x21]\n"
969 "fmla v10.4s, v16.4s, v8.4s\n"
970 "ldr q2, [%[wbptr], #48]\n"
971 "fmin v5.4s, v5.4s, v22.4s\n"
972 "ldr q23, [x21, %[input_col_stride1]]\n"
973 "fmax v12.4s, v12.4s, v20.4s\n"
974 "add x28, x28, #16\n"
975 "str q5, [%[outptr0]]\n"
976 "fmla v10.4s, v17.4s, v7.4s\n"
977 "fmin v12.4s, v12.4s, v22.4s\n"
978 "ldr q8, [%[wbptr], #128]\n"
979 "fmax v11.4s, v11.4s, v20.4s\n"
980 "ldr q16, [x28]\n"
981 "str q12, [%[outptr0], %[output_col_stride1]]\n"
982 "fmla v10.4s, v13.4s, v9.4s\n"
983 "fmin v11.4s, v11.4s, v22.4s\n"
984 "ldr q7, [%[wbptr], #96]\n"
985 "mov v5.16b, v14.16b\n"
986 "ldr q13, [x28, %[input_col_stride1]]\n"
987 "str q11, [x24]\n"
988 "fmax v10.4s, v10.4s, v20.4s\n"
989 "mov v11.16b, v14.16b\n"
990 "ldr q9, [%[wbptr], #144]\n"
991 "fmin v10.4s, v10.4s, v22.4s\n"
992 "add x27, x27, #16\n"
993 "mov v12.16b, v14.16b\n"
994 "ldr q17, [x27]\n"
995 "str q10, [x24, %[output_col_stride1]]\n"
996 "fmla v5.4s, v19.4s, v0.4s\n"
997 "mov v10.16b, v14.16b\n"
998 "add x22, x22, #16\n"
999 "fmla v11.4s, v16.4s, v0.4s\n"
1000 "add %[outptr0], %[outptr0], #16\n"
1001 "fmla v5.4s, v15.4s, v6.4s\n"
1002 "add x24, x24, #16\n"
1003 "subs x20, x20, #1\n"
1004 "fmla v5.4s, v21.4s, v1.4s\n"
1005 "fmla v5.4s, v16.4s, v4.4s\n"
1006 "bne 2b\n"
1007 "3:\n"
1008 "fmla v5.4s, v23.4s, v3.4s\n"
1009 "ldr q21, [x21, x23]\n"
1010 "fmla v12.4s, v18.4s, v0.4s\n"
1011 "ldr q20, [%[inptr0], x26]\n"
1012 "fmla v11.4s, v17.4s, v6.4s\n"
1013 "ldr q19, [x22]\n"
1014 "fmla v5.4s, v18.4s, v2.4s\n"
1015 "ldr q15, [x27, %[input_col_stride1]]\n"
1016 "fmla v12.4s, v21.4s, v6.4s\n"
1017 "ldr q16, [x28, x23]\n"
1018 "fmla v11.4s, v13.4s, v1.4s\n"
1019 "ldr q17, [x21, x26]\n"
1020 "fmla v5.4s, v13.4s, v8.4s\n"
1021 "ldr q14, [%[inptr0], x25]\n"
1022 "fmla v12.4s, v20.4s, v1.4s\n"
1023 "ldr q20, [x22, %[input_col_stride1]]\n"
1024 "fmla v11.4s, v19.4s, v4.4s\n"
1025 "ldr q19, [x27, x23]\n"
1026 "fmla v5.4s, v21.4s, v7.4s\n"
1027 "ldr q22, [x28, x26]\n"
1028 "fmla v12.4s, v16.4s, v4.4s\n"
1029 "ldr q21, [x21, x25]\n"
1030 "fmla v11.4s, v15.4s, v3.4s\n"
1031 "ldr q23, [x22, x23]\n"
1032 "fmla v5.4s, v16.4s, v9.4s\n"
1033 "ldr q18, [x27, x26]\n"
1034 "fmla v10.4s, v16.4s, v0.4s\n"
1035 "ldr q15, [x28, x25]\n"
1036 "fmla v11.4s, v16.4s, v2.4s\n"
1037 "ldr q16, [x22, x26]\n"
1038 "fmla v12.4s, v17.4s, v3.4s\n"
1039 "ldr q17, [x27, x25]\n"
1040 "fmla v10.4s, v19.4s, v6.4s\n"
1041 "ldr q13, [x22, x25]\n"
1042 "fmla v11.4s, v20.4s, v8.4s\n"
1043 "add %[wbptr], %[wbptr], #160\n"
1044 "fmla v12.4s, v14.4s, v2.4s\n"
1045 "prfm pldl1keep, [%[wbptr], #64]\n"
1046 "fmla v10.4s, v22.4s, v1.4s\n"
1047 "add %[inptr0], %[inptr0], #16\n"
1048 "fmla v11.4s, v19.4s, v7.4s\n"
1049 "add x21, x21, #16\n"
1050 "fmla v12.4s, v22.4s, v8.4s\n"
1051 "add x28, x28, #16\n"
1052 "fmla v10.4s, v23.4s, v4.4s\n"
1053 "add x27, x27, #16\n"
1054 "fmla v11.4s, v23.4s, v9.4s\n"
1055 "add x22, x22, #16\n"
1056 "fmla v12.4s, v21.4s, v7.4s\n"
1057 "movi v20.16b, #0\n"
1058 "fmla v10.4s, v18.4s, v3.4s\n"
1059 "fmov v22.4s, #6.0\n"
1060 "fmax v5.4s, v5.4s, v20.4s\n"
1061 "fmax v11.4s, v11.4s, v20.4s\n"
1062 "fmla v12.4s, v15.4s, v9.4s\n"
1063 "fmla v10.4s, v15.4s, v2.4s\n"
1064 "fmin v5.4s, v5.4s, v22.4s\n"
1065 "fmin v11.4s, v11.4s, v22.4s\n"
1066 "fmax v12.4s, v12.4s, v20.4s\n"
1067 "str q5, [%[outptr0]]\n"
1068 "str q11, [x24]\n"
1069 "fmla v10.4s, v16.4s, v8.4s\n"
1070 "fmin v12.4s, v12.4s, v22.4s\n"
1071 "str q12, [%[outptr0], %[output_col_stride1]]\n"
1072 "fmla v10.4s, v17.4s, v7.4s\n"
1073 "add %[outptr0], %[outptr0], #16\n"
1074 "fmla v10.4s, v13.4s, v9.4s\n"
1075 "fmax v10.4s, v10.4s, v20.4s\n"
1076 "fmin v10.4s, v10.4s, v22.4s\n"
1077 "str q10, [x24, %[output_col_stride1]]\n"
1078 "add x24, x24, #16\n"
1079 "4:\n"
1080 "cbz x19, 7f\n"
1081 "ldr s14, [%[wbptr]]\n"
1082 "mov v5.16b, v14.16b\n"
1083 "ldr s0, [%[wbptr], #4]\n"
1084 "mov v11.16b, v14.16b\n"
1085 "ldr s1, [%[wbptr], #8]\n"
1086 "mov v12.16b, v14.16b\n"
1087 "ldr s2, [%[wbptr], #12]\n"
1088 "mov v10.16b, v14.16b\n"
1089 "ldr s6, [%[wbptr], #16]\n"
1090 "ldr s3, [%[wbptr], #20]\n"
1091 "subs x19, x19, #1\n"
1092 "ldr s7, [%[wbptr], #24]\n"
1093 "ldr s4, [%[wbptr], #28]\n"
1094 "ldr s8, [%[wbptr], #32]\n"
1095 "ldr s9, [%[wbptr], #36]\n"
1096 "ldr s19, [%[inptr0]]\n"
1097 "ldr s15, [x21]\n"
1098 "fmla v5.4s, v19.4s, v0.4s\n"
1099 "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
1100 "ldr s16, [x28]\n"
1101 "ldr s23, [x21, %[input_col_stride1]]\n"
1102 "fmla v11.4s, v16.4s, v0.4s\n"
1103 "ldr s18, [%[inptr0], x23]\n"
1104 "fmla v5.4s, v15.4s, v6.4s\n"
1105 "ldr s17, [x27]\n"
1106 "ldr s13, [x28, %[input_col_stride1]]\n"
1107 "fmla v5.4s, v21.4s, v1.4s\n"
1108 "fmla v5.4s, v16.4s, v4.4s\n"
1109 "beq 6f\n"
1110 "5:\n"
1111 "fmla v5.4s, v23.4s, v3.4s\n"
1112 "ldr s21, [x21, x23]\n"
1113 "fmla v12.4s, v18.4s, v0.4s\n"
1114 "ldr s20, [%[inptr0], x26]\n"
1115 "fmla v11.4s, v17.4s, v6.4s\n"
1116 "ldr s19, [x22]\n"
1117 "fmla v5.4s, v18.4s, v2.4s\n"
1118 "ldr s15, [x27, %[input_col_stride1]]\n"
1119 "fmla v12.4s, v21.4s, v6.4s\n"
1120 "ldr s16, [x28, x23]\n"
1121 "fmla v11.4s, v13.4s, v1.4s\n"
1122 "ldr s17, [x21, x26]\n"
1123 "fmla v5.4s, v13.4s, v8.4s\n"
1124 "ldr s14, [%[inptr0], x25]\n"
1125 "fmla v12.4s, v20.4s, v1.4s\n"
1126 "ldr s20, [x22, %[input_col_stride1]]\n"
1127 "fmla v11.4s, v19.4s, v4.4s\n"
1128 "ldr s19, [x27, x23]\n"
1129 "fmla v5.4s, v21.4s, v7.4s\n"
1130 "ldr s22, [x28, x26]\n"
1131 "fmla v12.4s, v16.4s, v4.4s\n"
1132 "ldr s21, [x21, x25]\n"
1133 "fmla v11.4s, v15.4s, v3.4s\n"
1134 "ldr s23, [x22, x23]\n"
1135 "fmla v5.4s, v16.4s, v9.4s\n"
1136 "ldr s18, [x27, x26]\n"
1137 "fmla v10.4s, v16.4s, v0.4s\n"
1138 "ldr s15, [x28, x25]\n"
1139 "fmla v11.4s, v16.4s, v2.4s\n"
1140 "ldr s16, [x22, x26]\n"
1141 "fmla v12.4s, v17.4s, v3.4s\n"
1142 "ldr s17, [x27, x25]\n"
1143 "fmla v10.4s, v19.4s, v6.4s\n"
1144 "ldr s13, [x22, x25]\n"
1145 "fmla v11.4s, v20.4s, v8.4s\n"
1146 "add %[wbptr], %[wbptr], #40\n"
1147 "fmla v12.4s, v14.4s, v2.4s\n"
1148 "ldr s14, [%[wbptr]]\n"
1149 "fmla v10.4s, v22.4s, v1.4s\n"
1150 "ldr s0, [%[wbptr], #4]\n"
1151 "fmla v11.4s, v19.4s, v7.4s\n"
1152 "ldr s6, [%[wbptr], #16]\n"
1153 "fmla v12.4s, v22.4s, v8.4s\n"
1154 "prfm pldl1keep, [%[wbptr], #64]\n"
1155 "fmla v10.4s, v23.4s, v4.4s\n"
1156 "ldr s1, [%[wbptr], #8]\n"
1157 "fmla v11.4s, v23.4s, v9.4s\n"
1158 "add %[inptr0], %[inptr0], #4\n"
1159 "fmla v12.4s, v21.4s, v7.4s\n"
1160 "ldr s19, [%[inptr0]]\n"
1161 "fmla v10.4s, v18.4s, v3.4s\n"
1162 "ldr s4, [%[wbptr], #28]\n"
1163 "movi v20.16b, #0\n"
1164 "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
1165 "fmla v12.4s, v15.4s, v9.4s\n"
1166 "ldr s18, [%[inptr0], x23]\n"
1167 "fmla v10.4s, v15.4s, v2.4s\n"
1168 "ldr s3, [%[wbptr], #20]\n"
1169 "fmov v22.4s, #6.0\n"
1170 "add x21, x21, #4\n"
1171 "fmax v5.4s, v5.4s, v20.4s\n"
1172 "ldr s15, [x21]\n"
1173 "fmla v10.4s, v16.4s, v8.4s\n"
1174 "ldr s2, [%[wbptr], #12]\n"
1175 "fmin v5.4s, v5.4s, v22.4s\n"
1176 "ldr s23, [x21, %[input_col_stride1]]\n"
1177 "fmax v12.4s, v12.4s, v20.4s\n"
1178 "add x28, x28, #4\n"
1179 "str s5, [%[outptr0]]\n"
1180 "fmla v10.4s, v17.4s, v7.4s\n"
1181 "fmin v12.4s, v12.4s, v22.4s\n"
1182 "ldr s8, [%[wbptr], #32]\n"
1183 "fmax v11.4s, v11.4s, v20.4s\n"
1184 "ldr s16, [x28]\n"
1185 "str s12, [%[outptr0], %[output_col_stride1]]\n"
1186 "fmla v10.4s, v13.4s, v9.4s\n"
1187 "fmin v11.4s, v11.4s, v22.4s\n"
1188 "ldr s7, [%[wbptr], #24]\n"
1189 "mov v5.16b, v14.16b\n"
1190 "ldr s13, [x28, %[input_col_stride1]]\n"
1191 "str s11, [x24]\n"
1192 "fmax v10.4s, v10.4s, v20.4s\n"
1193 "mov v11.16b, v14.16b\n"
1194 "ldr s9, [%[wbptr], #36]\n"
1195 "fmin v10.4s, v10.4s, v22.4s\n"
1196 "add x27, x27, #4\n"
1197 "mov v12.16b, v14.16b\n"
1198 "ldr s17, [x27]\n"
1199 "str s10, [x24, %[output_col_stride1]]\n"
1200 "fmla v5.4s, v19.4s, v0.4s\n"
1201 "mov v10.16b, v14.16b\n"
1202 "add x22, x22, #4\n"
1203 "fmla v11.4s, v16.4s, v0.4s\n"
1204 "add %[outptr0], %[outptr0], #4\n"
1205 "fmla v5.4s, v15.4s, v6.4s\n"
1206 "add x24, x24, #4\n"
1207 "subs x19, x19, #1\n"
1208 "fmla v5.4s, v21.4s, v1.4s\n"
1209 "fmla v5.4s, v16.4s, v4.4s\n"
1210 "bne 5b\n"
1211 "6:\n"
1212 "fmla v5.4s, v23.4s, v3.4s\n"
1213 "ldr s21, [x21, x23]\n"
1214 "fmla v12.4s, v18.4s, v0.4s\n"
1215 "ldr s20, [%[inptr0], x26]\n"
1216 "fmla v11.4s, v17.4s, v6.4s\n"
1217 "ldr s19, [x22]\n"
1218 "fmla v5.4s, v18.4s, v2.4s\n"
1219 "ldr s15, [x27, %[input_col_stride1]]\n"
1220 "fmla v12.4s, v21.4s, v6.4s\n"
1221 "ldr s16, [x28, x23]\n"
1222 "fmla v11.4s, v13.4s, v1.4s\n"
1223 "ldr s17, [x21, x26]\n"
1224 "fmla v5.4s, v13.4s, v8.4s\n"
1225 "ldr s14, [%[inptr0], x25]\n"
1226 "fmla v12.4s, v20.4s, v1.4s\n"
1227 "ldr s20, [x22, %[input_col_stride1]]\n"
1228 "fmla v11.4s, v19.4s, v4.4s\n"
1229 "ldr s19, [x27, x23]\n"
1230 "fmla v5.4s, v21.4s, v7.4s\n"
1231 "ldr s22, [x28, x26]\n"
1232 "fmla v12.4s, v16.4s, v4.4s\n"
1233 "ldr s21, [x21, x25]\n"
1234 "fmla v11.4s, v15.4s, v3.4s\n"
1235 "ldr s23, [x22, x23]\n"
1236 "fmla v5.4s, v16.4s, v9.4s\n"
1237 "ldr s18, [x27, x26]\n"
1238 "fmla v10.4s, v16.4s, v0.4s\n"
1239 "ldr s15, [x28, x25]\n"
1240 "fmla v11.4s, v16.4s, v2.4s\n"
1241 "ldr s16, [x22, x26]\n"
1242 "fmla v12.4s, v17.4s, v3.4s\n"
1243 "ldr s17, [x27, x25]\n"
1244 "fmla v10.4s, v19.4s, v6.4s\n"
1245 "ldr s13, [x22, x25]\n"
1246 "fmla v11.4s, v20.4s, v8.4s\n"
1247 "add %[wbptr], %[wbptr], #40\n"
1248 "fmla v12.4s, v14.4s, v2.4s\n"
1249 "prfm pldl1keep, [%[wbptr], #64]\n"
1250 "fmla v10.4s, v22.4s, v1.4s\n"
1251 "add %[inptr0], %[inptr0], #4\n"
1252 "fmla v11.4s, v19.4s, v7.4s\n"
1253 "add x21, x21, #4\n"
1254 "fmla v12.4s, v22.4s, v8.4s\n"
1255 "add x28, x28, #4\n"
1256 "fmla v10.4s, v23.4s, v4.4s\n"
1257 "add x27, x27, #4\n"
1258 "fmla v11.4s, v23.4s, v9.4s\n"
1259 "add x22, x22, #4\n"
1260 "fmla v12.4s, v21.4s, v7.4s\n"
1261 "movi v20.16b, #0\n"
1262 "fmla v10.4s, v18.4s, v3.4s\n"
1263 "fmov v22.4s, #6.0\n"
1264 "fmax v5.4s, v5.4s, v20.4s\n"
1265 "fmax v11.4s, v11.4s, v20.4s\n"
1266 "fmla v12.4s, v15.4s, v9.4s\n"
1267 "fmla v10.4s, v15.4s, v2.4s\n"
1268 "fmin v5.4s, v5.4s, v22.4s\n"
1269 "fmin v11.4s, v11.4s, v22.4s\n"
1270 "fmax v12.4s, v12.4s, v20.4s\n"
1271 "str s5, [%[outptr0]]\n"
1272 "str s11, [x24]\n"
1273 "fmla v10.4s, v16.4s, v8.4s\n"
1274 "fmin v12.4s, v12.4s, v22.4s\n"
1275 "str s12, [%[outptr0], %[output_col_stride1]]\n"
1276 "fmla v10.4s, v17.4s, v7.4s\n"
1277 "add %[outptr0], %[outptr0], #4\n"
1278 "fmla v10.4s, v13.4s, v9.4s\n"
1279 "fmax v10.4s, v10.4s, v20.4s\n"
1280 "fmin v10.4s, v10.4s, v22.4s\n"
1281 "str s10, [x24, %[output_col_stride1]]\n"
1282 "add x24, x24, #4\n"
1283 "7:\n"
1284 : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
1285 : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
1286 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
1287 );
1288}
Georgios Pinitasbe0ae932018-03-13 13:08:12 +00001289
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00001290#endif // __aarch64__
Georgios Pinitasbe0ae932018-03-13 13:08:12 +00001291
// Explicit instantiation definition of the specialisation that this file
// aliases as `Conv`. It requests that the class template's members be
// instantiated for these arguments in this translation unit.
// NOTE(review): the six integer arguments presumably encode tile, kernel and
// stride dimensions; confirm against the DepthwiseConvolution declaration.
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00001292template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +00001293
Georgios Pinitas4074c992018-01-30 18:13:46 +00001294} // namespace depthwise