/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "impl_fp32_fp32.hpp"

26namespace depthwise
27{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000028
29using namespace neon_convolution_kernels;
30using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
Georgios Pinitas4074c992018-01-30 18:13:46 +000031
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000032#ifdef __aarch64__
Georgios Pinitas4074c992018-01-30 18:13:46 +000033template <>
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000034template <>
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000035void Conv::execute_tile<ActivationFunction::None>(
36 int n_channels,
37 const void* weight_bias_ptr,
38 const float* input,
39 const unsigned int input_row_stride,
40 const unsigned int input_col_stride,
41 float* output,
42 const unsigned int output_row_stride,
43 const unsigned int output_col_stride
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000044)
45{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000046 __asm __volatile(
47 "add x15, %[inptr0], %[input_row_stride]\n"
48 "add x26, %[input_col_stride1], %[input_col_stride1]\n"
49 "add x21, %[outptr0], %[output_row_stride]\n"
50 "add x16, x15, %[input_row_stride]\n"
51 "add x27, x26, %[input_col_stride1]\n"
52 "add x22, x21, %[output_row_stride]\n"
53 "add x17, x16, %[input_row_stride]\n"
54 "add x28, x27, %[input_col_stride1]\n"
55 "add x23, %[output_col_stride1], %[output_col_stride1]\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +000056 "add x9, x17, %[input_row_stride]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000057 "add x13, x28, %[input_col_stride1]\n"
58 "and x24, %[n_channels], #3\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +000059 "add x19, x9, %[input_row_stride]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000060 "add x14, x13, %[input_col_stride1]\n"
61 "lsr x25, %[n_channels], #2\n"
62 "add x20, x19, %[input_row_stride]\n"
63 "cbz x25, 4f\n"
64 "1:\n"
65 "ldr q27, [%[wbptr]]\n"
66 "subs x25, x25, #1\n"
67 "mov v17.16b, v27.16b\n"
68 "ldr q6, [%[wbptr], #16]\n"
69 "mov v16.16b, v27.16b\n"
70 "ldr q14, [%[wbptr], #32]\n"
71 "mov v15.16b, v27.16b\n"
72 "ldr q13, [%[wbptr], #48]\n"
73 "mov v2.16b, v27.16b\n"
74 "ldr q12, [%[wbptr], #64]\n"
75 "mov v4.16b, v27.16b\n"
76 "ldr q11, [%[wbptr], #80]\n"
77 "mov v5.16b, v27.16b\n"
78 "ldr q10, [%[wbptr], #96]\n"
79 "mov v1.16b, v27.16b\n"
80 "ldr q9, [%[wbptr], #112]\n"
81 "mov v3.16b, v27.16b\n"
82 "ldr q8, [%[wbptr], #128]\n"
83 "mov v0.16b, v27.16b\n"
84 "ldr q7, [%[wbptr], #144]\n"
85 "ldr q29, [%[inptr0]]\n"
86 "ldr q28, [x15]\n"
87 "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
88 "ldr q22, [x16]\n"
89 "ldr q20, [x15, %[input_col_stride1]]\n"
90 "ldr q19, [%[inptr0], x26]\n"
91 "ldr q30, [x17]\n"
92 "ldr q18, [x16, %[input_col_stride1]]\n"
93 "beq 3f\n"
94 "2:\n"
95 "fmla v17.4s, v29.4s, v6.4s\n"
96 "ldr q21, [x15, x26]\n"
97 "fmla v16.4s, v22.4s, v6.4s\n"
98 "ldr q27, [%[inptr0], x27]\n"
99 "fmla v15.4s, v19.4s, v6.4s\n"
100 "add %[wbptr], %[wbptr], #160\n"
101 "fmla v17.4s, v28.4s, v12.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000102 "ldr q25, [x9]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000103 "fmla v16.4s, v30.4s, v12.4s\n"
104 "ldr q24, [x17, %[input_col_stride1]]\n"
105 "fmla v15.4s, v21.4s, v12.4s\n"
106 "prfm pldl1keep, [%[wbptr], #64]\n"
107 "fmla v17.4s, v26.4s, v14.4s\n"
108 "ldr q23, [x16, x26]\n"
109 "fmla v16.4s, v18.4s, v14.4s\n"
110 "subs x25, x25, #1\n"
111 "fmla v15.4s, v27.4s, v14.4s\n"
112 "ldr q26, [x15, x27]\n"
113 "fmla v17.4s, v22.4s, v9.4s\n"
114 "ldr q22, [%[inptr0], x28]\n"
115 "fmla v16.4s, v25.4s, v9.4s\n"
116 "fmla v2.4s, v25.4s, v6.4s\n"
117 "fmla v15.4s, v23.4s, v9.4s\n"
118 "ldr q30, [x19]\n"
119 "fmla v17.4s, v20.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000120 "ldr q29, [x9, %[input_col_stride1]]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000121 "fmla v16.4s, v24.4s, v11.4s\n"
122 "ldr q28, [x17, x26]\n"
123 "fmla v4.4s, v23.4s, v6.4s\n"
124 "fmla v15.4s, v26.4s, v11.4s\n"
125 "fmla v17.4s, v19.4s, v13.4s\n"
126 "ldr q24, [x16, x27]\n"
127 "fmla v16.4s, v23.4s, v13.4s\n"
128 "ldr q25, [x15, x28]\n"
129 "fmla v15.4s, v22.4s, v13.4s\n"
130 "fmla v5.4s, v22.4s, v6.4s\n"
131 "fmla v17.4s, v18.4s, v8.4s\n"
132 "ldr q19, [%[inptr0], x13]\n"
133 "fmla v2.4s, v30.4s, v12.4s\n"
134 "ldr q18, [x20]\n"
135 "fmla v16.4s, v29.4s, v8.4s\n"
136 "ldr q22, [x19, %[input_col_stride1]]\n"
137 "fmla v17.4s, v21.4s, v10.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000138 "ldr q26, [x9, x26]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000139 "fmla v2.4s, v29.4s, v14.4s\n"
140 "ldr q20, [x17, x27]\n"
141 "fmla v16.4s, v28.4s, v10.4s\n"
142 "fmla v4.4s, v28.4s, v12.4s\n"
143 "fmla v17.4s, v23.4s, v7.4s\n"
144 "ldr q27, [x16, x28]\n"
145 "fmla v15.4s, v24.4s, v8.4s\n"
146 "ldr q30, [x15, x13]\n"
147 "fmla v4.4s, v24.4s, v14.4s\n"
148 "ldr q24, [%[inptr0], x14]\n"
149 "str q17, [%[outptr0]]\n"
150 "fmla v5.4s, v25.4s, v12.4s\n"
151 "fmla v15.4s, v25.4s, v10.4s\n"
152 "ldr q28, [x20, %[input_col_stride1]]\n"
153 "fmla v2.4s, v18.4s, v9.4s\n"
154 "ldr q17, [x19, x26]\n"
155 "fmla v5.4s, v19.4s, v14.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000156 "ldr q18, [x9, x27]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000157 "fmla v16.4s, v26.4s, v7.4s\n"
158 "ldr q25, [x17, x28]\n"
159 "fmla v2.4s, v22.4s, v11.4s\n"
160 "ldr q22, [x16, x13]\n"
161 "fmla v4.4s, v26.4s, v9.4s\n"
162 "add %[inptr0], %[inptr0], #16\n"
163 "str q16, [x21]\n"
164 "fmla v1.4s, v26.4s, v6.4s\n"
165 "fmla v2.4s, v26.4s, v13.4s\n"
166 "ldr q21, [x15, x14]\n"
167 "fmla v4.4s, v20.4s, v11.4s\n"
168 "ldr q23, [x20, x26]\n"
169 "fmla v15.4s, v27.4s, v7.4s\n"
170 "ldr q19, [x19, x27]\n"
171 "fmla v5.4s, v27.4s, v9.4s\n"
172 "add x15, x15, #16\n"
173 "fmla v4.4s, v27.4s, v13.4s\n"
174 "fmla v3.4s, v27.4s, v6.4s\n"
175 "str q15, [%[outptr0], %[output_col_stride1]]\n"
176 "fmla v2.4s, v28.4s, v8.4s\n"
177 "fmla v5.4s, v30.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000178 "ldr q29, [x9, x28]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000179 "fmla v1.4s, v17.4s, v12.4s\n"
180 "ldr q27, [x17, x13]\n"
181 "fmla v2.4s, v17.4s, v10.4s\n"
182 "ldr q28, [x16, x14]\n"
183 "fmla v5.4s, v24.4s, v13.4s\n"
184 "ldr q26, [x20, x27]\n"
185 "fmla v4.4s, v18.4s, v8.4s\n"
186 "ldr q20, [x19, x28]\n"
187 "fmla v1.4s, v18.4s, v14.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000188 "ldr q17, [x9, x13]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000189 "fmla v3.4s, v25.4s, v12.4s\n"
190 "ldr q18, [x17, x14]\n"
191 "fmla v4.4s, v25.4s, v10.4s\n"
192 "ldr q16, [x20, x28]\n"
193 "fmla v5.4s, v22.4s, v8.4s\n"
194 "add x16, x16, #16\n"
195 "fmla v3.4s, v22.4s, v14.4s\n"
196 "ldr q15, [x19, x13]\n"
197 "fmla v2.4s, v23.4s, v7.4s\n"
198 "add x17, x17, #16\n"
199 "fmla v5.4s, v21.4s, v10.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000200 "ldr q21, [x9, x14]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000201 "fmla v1.4s, v23.4s, v9.4s\n"
202 "ldr q23, [x20, x13]\n"
203 "str q2, [x22]\n"
204 "fmla v4.4s, v29.4s, v7.4s\n"
205 "fmla v3.4s, v29.4s, v9.4s\n"
206 "ldr q24, [x19, x14]\n"
207 "fmla v1.4s, v19.4s, v11.4s\n"
208 "ldr q25, [x20, x14]\n"
209 "str q4, [x21, %[output_col_stride1]]\n"
210 "fmla v0.4s, v29.4s, v6.4s\n"
211 "fmla v3.4s, v27.4s, v11.4s\n"
212 "ldr q27, [%[wbptr]]\n"
213 "fmla v1.4s, v29.4s, v13.4s\n"
214 "ldr q29, [%[inptr0]]\n"
215 "fmla v5.4s, v28.4s, v7.4s\n"
216 "ldr q6, [%[wbptr], #16]\n"
217 "fmla v3.4s, v28.4s, v13.4s\n"
218 "ldr q28, [x15]\n"
219 "fmla v1.4s, v26.4s, v8.4s\n"
220 "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
221 "str q5, [%[outptr0], x23]\n"
222 "fmla v0.4s, v20.4s, v12.4s\n"
223 "fmla v3.4s, v17.4s, v8.4s\n"
224 "ldr q22, [x16]\n"
225 "fmla v1.4s, v20.4s, v10.4s\n"
226 "ldr q20, [x15, %[input_col_stride1]]\n"
227 "fmla v0.4s, v17.4s, v14.4s\n"
228 "ldr q12, [%[wbptr], #64]\n"
229 "fmla v3.4s, v18.4s, v10.4s\n"
230 "ldr q19, [%[inptr0], x26]\n"
231 "fmla v1.4s, v16.4s, v7.4s\n"
232 "ldr q30, [x17]\n"
233 "fmla v0.4s, v16.4s, v9.4s\n"
234 "ldr q14, [%[wbptr], #32]\n"
235 "fmla v3.4s, v21.4s, v7.4s\n"
236 "ldr q18, [x16, %[input_col_stride1]]\n"
237 "str q1, [x22, %[output_col_stride1]]\n"
238 "mov v17.16b, v27.16b\n"
239 "fmla v0.4s, v15.4s, v11.4s\n"
240 "ldr q9, [%[wbptr], #112]\n"
241 "str q3, [x21, x23]\n"
242 "mov v16.16b, v27.16b\n"
243 "mov v15.16b, v27.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000244 "add x9, x9, #16\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000245 "fmla v0.4s, v21.4s, v13.4s\n"
246 "ldr q11, [%[wbptr], #80]\n"
247 "mov v2.16b, v27.16b\n"
248 "add x19, x19, #16\n"
249 "mov v4.16b, v27.16b\n"
250 "add x20, x20, #16\n"
251 "fmla v0.4s, v23.4s, v8.4s\n"
252 "ldr q13, [%[wbptr], #48]\n"
253 "mov v5.16b, v27.16b\n"
254 "add %[outptr0], %[outptr0], #16\n"
255 "mov v1.16b, v27.16b\n"
256 "add x21, x21, #16\n"
257 "fmla v0.4s, v24.4s, v10.4s\n"
258 "ldr q8, [%[wbptr], #128]\n"
259 "mov v3.16b, v27.16b\n"
260 "fmla v0.4s, v25.4s, v7.4s\n"
261 "ldr q10, [%[wbptr], #96]\n"
262 "str q0, [x22, x23]\n"
263 "mov v0.16b, v27.16b\n"
264 "ldr q7, [%[wbptr], #144]\n"
265 "add x22, x22, #16\n"
266 "bne 2b\n"
267 "3:\n"
268 "fmla v17.4s, v29.4s, v6.4s\n"
269 "ldr q21, [x15, x26]\n"
270 "fmla v16.4s, v22.4s, v6.4s\n"
271 "ldr q27, [%[inptr0], x27]\n"
272 "fmla v15.4s, v19.4s, v6.4s\n"
273 "add %[wbptr], %[wbptr], #160\n"
274 "fmla v17.4s, v28.4s, v12.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000275 "ldr q25, [x9]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000276 "fmla v16.4s, v30.4s, v12.4s\n"
277 "ldr q24, [x17, %[input_col_stride1]]\n"
278 "fmla v15.4s, v21.4s, v12.4s\n"
279 "prfm pldl1keep, [%[wbptr], #64]\n"
280 "fmla v17.4s, v26.4s, v14.4s\n"
281 "ldr q23, [x16, x26]\n"
282 "fmla v16.4s, v18.4s, v14.4s\n"
283 "fmla v2.4s, v25.4s, v6.4s\n"
284 "fmla v15.4s, v27.4s, v14.4s\n"
285 "ldr q26, [x15, x27]\n"
286 "fmla v17.4s, v22.4s, v9.4s\n"
287 "ldr q22, [%[inptr0], x28]\n"
288 "fmla v16.4s, v25.4s, v9.4s\n"
289 "ldr q30, [x19]\n"
290 "fmla v15.4s, v23.4s, v9.4s\n"
291 "fmla v4.4s, v23.4s, v6.4s\n"
292 "fmla v17.4s, v20.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000293 "ldr q29, [x9, %[input_col_stride1]]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000294 "fmla v16.4s, v24.4s, v11.4s\n"
295 "ldr q28, [x17, x26]\n"
296 "fmla v15.4s, v26.4s, v11.4s\n"
297 "ldr q24, [x16, x27]\n"
298 "fmla v17.4s, v19.4s, v13.4s\n"
299 "ldr q25, [x15, x28]\n"
300 "fmla v16.4s, v23.4s, v13.4s\n"
301 "fmla v5.4s, v22.4s, v6.4s\n"
302 "fmla v15.4s, v22.4s, v13.4s\n"
303 "ldr q19, [%[inptr0], x13]\n"
304 "fmla v17.4s, v18.4s, v8.4s\n"
305 "ldr q18, [x20]\n"
306 "fmla v2.4s, v30.4s, v12.4s\n"
307 "ldr q22, [x19, %[input_col_stride1]]\n"
308 "fmla v16.4s, v29.4s, v8.4s\n"
309 "fmla v4.4s, v28.4s, v12.4s\n"
310 "fmla v17.4s, v21.4s, v10.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000311 "ldr q26, [x9, x26]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000312 "fmla v2.4s, v29.4s, v14.4s\n"
313 "ldr q20, [x17, x27]\n"
314 "fmla v16.4s, v28.4s, v10.4s\n"
315 "ldr q27, [x16, x28]\n"
316 "fmla v17.4s, v23.4s, v7.4s\n"
317 "ldr q30, [x15, x13]\n"
318 "fmla v15.4s, v24.4s, v8.4s\n"
319 "fmla v4.4s, v24.4s, v14.4s\n"
320 "fmla v5.4s, v25.4s, v12.4s\n"
321 "ldr q24, [%[inptr0], x14]\n"
322 "str q17, [%[outptr0]]\n"
323 "fmla v2.4s, v18.4s, v9.4s\n"
324 "fmla v15.4s, v25.4s, v10.4s\n"
325 "ldr q28, [x20, %[input_col_stride1]]\n"
326 "fmla v5.4s, v19.4s, v14.4s\n"
327 "ldr q17, [x19, x26]\n"
328 "fmla v2.4s, v22.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000329 "ldr q18, [x9, x27]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000330 "fmla v16.4s, v26.4s, v7.4s\n"
331 "ldr q25, [x17, x28]\n"
332 "fmla v4.4s, v26.4s, v9.4s\n"
333 "ldr q22, [x16, x13]\n"
334 "fmla v2.4s, v26.4s, v13.4s\n"
335 "add %[inptr0], %[inptr0], #16\n"
336 "str q16, [x21]\n"
337 "fmla v1.4s, v26.4s, v6.4s\n"
338 "fmla v4.4s, v20.4s, v11.4s\n"
339 "ldr q21, [x15, x14]\n"
340 "fmla v15.4s, v27.4s, v7.4s\n"
341 "ldr q23, [x20, x26]\n"
342 "fmla v5.4s, v27.4s, v9.4s\n"
343 "ldr q19, [x19, x27]\n"
344 "fmla v4.4s, v27.4s, v13.4s\n"
345 "add x15, x15, #16\n"
346 "str q15, [%[outptr0], %[output_col_stride1]]\n"
347 "fmla v3.4s, v27.4s, v6.4s\n"
348 "fmla v5.4s, v30.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000349 "ldr q29, [x9, x28]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000350 "fmla v2.4s, v28.4s, v8.4s\n"
351 "ldr q27, [x17, x13]\n"
352 "fmla v1.4s, v17.4s, v12.4s\n"
353 "ldr q28, [x16, x14]\n"
354 "fmla v5.4s, v24.4s, v13.4s\n"
355 "ldr q26, [x20, x27]\n"
356 "fmla v2.4s, v17.4s, v10.4s\n"
357 "ldr q20, [x19, x28]\n"
358 "fmla v4.4s, v18.4s, v8.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000359 "ldr q17, [x9, x13]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000360 "fmla v1.4s, v18.4s, v14.4s\n"
361 "ldr q18, [x17, x14]\n"
362 "fmla v3.4s, v25.4s, v12.4s\n"
363 "add x16, x16, #16\n"
364 "fmla v4.4s, v25.4s, v10.4s\n"
365 "ldr q16, [x20, x28]\n"
366 "fmla v5.4s, v22.4s, v8.4s\n"
367 "add x17, x17, #16\n"
368 "fmla v3.4s, v22.4s, v14.4s\n"
369 "ldr q15, [x19, x13]\n"
370 "fmla v2.4s, v23.4s, v7.4s\n"
371 "fmla v1.4s, v23.4s, v9.4s\n"
372 "fmla v5.4s, v21.4s, v10.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000373 "ldr q21, [x9, x14]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000374 "fmla v4.4s, v29.4s, v7.4s\n"
375 "ldr q23, [x20, x13]\n"
376 "str q2, [x22]\n"
377 "fmla v1.4s, v19.4s, v11.4s\n"
378 "fmla v3.4s, v29.4s, v9.4s\n"
379 "ldr q24, [x19, x14]\n"
380 "str q4, [x21, %[output_col_stride1]]\n"
381 "fmla v0.4s, v29.4s, v6.4s\n"
382 "fmla v1.4s, v29.4s, v13.4s\n"
383 "ldr q25, [x20, x14]\n"
384 "fmla v3.4s, v27.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000385 "add x9, x9, #16\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000386 "fmla v5.4s, v28.4s, v7.4s\n"
387 "add x19, x19, #16\n"
388 "fmla v1.4s, v26.4s, v8.4s\n"
389 "add x20, x20, #16\n"
390 "fmla v3.4s, v28.4s, v13.4s\n"
391 "fmla v0.4s, v20.4s, v12.4s\n"
392 "str q5, [%[outptr0], x23]\n"
393 "fmla v1.4s, v20.4s, v10.4s\n"
394 "fmla v3.4s, v17.4s, v8.4s\n"
395 "add %[outptr0], %[outptr0], #16\n"
396 "fmla v0.4s, v17.4s, v14.4s\n"
397 "fmla v1.4s, v16.4s, v7.4s\n"
398 "fmla v3.4s, v18.4s, v10.4s\n"
399 "fmla v0.4s, v16.4s, v9.4s\n"
400 "str q1, [x22, %[output_col_stride1]]\n"
401 "fmla v3.4s, v21.4s, v7.4s\n"
402 "fmla v0.4s, v15.4s, v11.4s\n"
403 "str q3, [x21, x23]\n"
404 "fmla v0.4s, v21.4s, v13.4s\n"
405 "add x21, x21, #16\n"
406 "fmla v0.4s, v23.4s, v8.4s\n"
407 "fmla v0.4s, v24.4s, v10.4s\n"
408 "fmla v0.4s, v25.4s, v7.4s\n"
409 "str q0, [x22, x23]\n"
410 "add x22, x22, #16\n"
411 "4:\n"
412 "cbz x24, 7f\n"
413 "ldr s27, [%[wbptr]]\n"
414 "mov v17.16b, v27.16b\n"
415 "ldr s6, [%[wbptr], #4]\n"
416 "mov v16.16b, v27.16b\n"
417 "ldr s14, [%[wbptr], #8]\n"
418 "mov v15.16b, v27.16b\n"
419 "ldr s13, [%[wbptr], #12]\n"
420 "mov v2.16b, v27.16b\n"
421 "ldr s12, [%[wbptr], #16]\n"
422 "mov v4.16b, v27.16b\n"
423 "ldr s11, [%[wbptr], #20]\n"
424 "mov v5.16b, v27.16b\n"
425 "ldr s10, [%[wbptr], #24]\n"
426 "mov v1.16b, v27.16b\n"
427 "ldr s9, [%[wbptr], #28]\n"
428 "mov v3.16b, v27.16b\n"
429 "ldr s8, [%[wbptr], #32]\n"
430 "mov v0.16b, v27.16b\n"
431 "ldr s7, [%[wbptr], #36]\n"
432 "ldr s29, [%[inptr0]]\n"
433 "subs x24, x24, #1\n"
434 "ldr s28, [x15]\n"
435 "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
436 "ldr s22, [x16]\n"
437 "ldr s20, [x15, %[input_col_stride1]]\n"
438 "ldr s19, [%[inptr0], x26]\n"
439 "ldr s30, [x17]\n"
440 "ldr s18, [x16, %[input_col_stride1]]\n"
441 "beq 6f\n"
442 "5:\n"
443 "fmla v17.4s, v29.4s, v6.4s\n"
444 "ldr s21, [x15, x26]\n"
445 "fmla v16.4s, v22.4s, v6.4s\n"
446 "ldr s27, [%[inptr0], x27]\n"
447 "fmla v15.4s, v19.4s, v6.4s\n"
448 "add %[wbptr], %[wbptr], #40\n"
449 "fmla v17.4s, v28.4s, v12.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000450 "ldr s25, [x9]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000451 "fmla v16.4s, v30.4s, v12.4s\n"
452 "ldr s24, [x17, %[input_col_stride1]]\n"
453 "fmla v15.4s, v21.4s, v12.4s\n"
454 "prfm pldl1keep, [%[wbptr], #64]\n"
455 "fmla v17.4s, v26.4s, v14.4s\n"
456 "ldr s23, [x16, x26]\n"
457 "fmla v16.4s, v18.4s, v14.4s\n"
458 "subs x24, x24, #1\n"
459 "fmla v15.4s, v27.4s, v14.4s\n"
460 "ldr s26, [x15, x27]\n"
461 "fmla v17.4s, v22.4s, v9.4s\n"
462 "ldr s22, [%[inptr0], x28]\n"
463 "fmla v16.4s, v25.4s, v9.4s\n"
464 "fmla v2.4s, v25.4s, v6.4s\n"
465 "fmla v15.4s, v23.4s, v9.4s\n"
466 "ldr s30, [x19]\n"
467 "fmla v17.4s, v20.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000468 "ldr s29, [x9, %[input_col_stride1]]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000469 "fmla v16.4s, v24.4s, v11.4s\n"
470 "ldr s28, [x17, x26]\n"
471 "fmla v4.4s, v23.4s, v6.4s\n"
472 "fmla v15.4s, v26.4s, v11.4s\n"
473 "fmla v17.4s, v19.4s, v13.4s\n"
474 "ldr s24, [x16, x27]\n"
475 "fmla v16.4s, v23.4s, v13.4s\n"
476 "ldr s25, [x15, x28]\n"
477 "fmla v15.4s, v22.4s, v13.4s\n"
478 "fmla v5.4s, v22.4s, v6.4s\n"
479 "fmla v17.4s, v18.4s, v8.4s\n"
480 "ldr s19, [%[inptr0], x13]\n"
481 "fmla v2.4s, v30.4s, v12.4s\n"
482 "ldr s18, [x20]\n"
483 "fmla v16.4s, v29.4s, v8.4s\n"
484 "ldr s22, [x19, %[input_col_stride1]]\n"
485 "fmla v17.4s, v21.4s, v10.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000486 "ldr s26, [x9, x26]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000487 "fmla v2.4s, v29.4s, v14.4s\n"
488 "ldr s20, [x17, x27]\n"
489 "fmla v16.4s, v28.4s, v10.4s\n"
490 "fmla v4.4s, v28.4s, v12.4s\n"
491 "fmla v17.4s, v23.4s, v7.4s\n"
492 "ldr s27, [x16, x28]\n"
493 "fmla v15.4s, v24.4s, v8.4s\n"
494 "ldr s30, [x15, x13]\n"
495 "fmla v4.4s, v24.4s, v14.4s\n"
496 "ldr s24, [%[inptr0], x14]\n"
497 "str s17, [%[outptr0]]\n"
498 "fmla v5.4s, v25.4s, v12.4s\n"
499 "fmla v15.4s, v25.4s, v10.4s\n"
500 "ldr s28, [x20, %[input_col_stride1]]\n"
501 "fmla v2.4s, v18.4s, v9.4s\n"
502 "ldr s17, [x19, x26]\n"
503 "fmla v5.4s, v19.4s, v14.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000504 "ldr s18, [x9, x27]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000505 "fmla v16.4s, v26.4s, v7.4s\n"
506 "ldr s25, [x17, x28]\n"
507 "fmla v2.4s, v22.4s, v11.4s\n"
508 "ldr s22, [x16, x13]\n"
509 "fmla v4.4s, v26.4s, v9.4s\n"
510 "add %[inptr0], %[inptr0], #4\n"
511 "str s16, [x21]\n"
512 "fmla v1.4s, v26.4s, v6.4s\n"
513 "fmla v2.4s, v26.4s, v13.4s\n"
514 "ldr s21, [x15, x14]\n"
515 "fmla v4.4s, v20.4s, v11.4s\n"
516 "ldr s23, [x20, x26]\n"
517 "fmla v15.4s, v27.4s, v7.4s\n"
518 "ldr s19, [x19, x27]\n"
519 "fmla v5.4s, v27.4s, v9.4s\n"
520 "add x15, x15, #4\n"
521 "fmla v4.4s, v27.4s, v13.4s\n"
522 "fmla v3.4s, v27.4s, v6.4s\n"
523 "str s15, [%[outptr0], %[output_col_stride1]]\n"
524 "fmla v2.4s, v28.4s, v8.4s\n"
525 "fmla v5.4s, v30.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000526 "ldr s29, [x9, x28]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000527 "fmla v1.4s, v17.4s, v12.4s\n"
528 "ldr s27, [x17, x13]\n"
529 "fmla v2.4s, v17.4s, v10.4s\n"
530 "ldr s28, [x16, x14]\n"
531 "fmla v5.4s, v24.4s, v13.4s\n"
532 "ldr s26, [x20, x27]\n"
533 "fmla v4.4s, v18.4s, v8.4s\n"
534 "ldr s20, [x19, x28]\n"
535 "fmla v1.4s, v18.4s, v14.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000536 "ldr s17, [x9, x13]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000537 "fmla v3.4s, v25.4s, v12.4s\n"
538 "ldr s18, [x17, x14]\n"
539 "fmla v4.4s, v25.4s, v10.4s\n"
540 "ldr s16, [x20, x28]\n"
541 "fmla v5.4s, v22.4s, v8.4s\n"
542 "add x16, x16, #4\n"
543 "fmla v3.4s, v22.4s, v14.4s\n"
544 "ldr s15, [x19, x13]\n"
545 "fmla v2.4s, v23.4s, v7.4s\n"
546 "add x17, x17, #4\n"
547 "fmla v5.4s, v21.4s, v10.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000548 "ldr s21, [x9, x14]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000549 "fmla v1.4s, v23.4s, v9.4s\n"
550 "ldr s23, [x20, x13]\n"
551 "str s2, [x22]\n"
552 "fmla v4.4s, v29.4s, v7.4s\n"
553 "fmla v3.4s, v29.4s, v9.4s\n"
554 "ldr s24, [x19, x14]\n"
555 "fmla v1.4s, v19.4s, v11.4s\n"
556 "ldr s25, [x20, x14]\n"
557 "str s4, [x21, %[output_col_stride1]]\n"
558 "fmla v0.4s, v29.4s, v6.4s\n"
559 "fmla v3.4s, v27.4s, v11.4s\n"
560 "ldr s27, [%[wbptr]]\n"
561 "fmla v1.4s, v29.4s, v13.4s\n"
562 "ldr s29, [%[inptr0]]\n"
563 "fmla v5.4s, v28.4s, v7.4s\n"
564 "ldr s6, [%[wbptr], #4]\n"
565 "fmla v3.4s, v28.4s, v13.4s\n"
566 "ldr s28, [x15]\n"
567 "fmla v1.4s, v26.4s, v8.4s\n"
568 "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
569 "str s5, [%[outptr0], x23]\n"
570 "fmla v0.4s, v20.4s, v12.4s\n"
571 "fmla v3.4s, v17.4s, v8.4s\n"
572 "ldr s22, [x16]\n"
573 "fmla v1.4s, v20.4s, v10.4s\n"
574 "ldr s20, [x15, %[input_col_stride1]]\n"
575 "fmla v0.4s, v17.4s, v14.4s\n"
576 "ldr s12, [%[wbptr], #16]\n"
577 "fmla v3.4s, v18.4s, v10.4s\n"
578 "ldr s19, [%[inptr0], x26]\n"
579 "fmla v1.4s, v16.4s, v7.4s\n"
580 "ldr s30, [x17]\n"
581 "fmla v0.4s, v16.4s, v9.4s\n"
582 "ldr s14, [%[wbptr], #8]\n"
583 "fmla v3.4s, v21.4s, v7.4s\n"
584 "ldr s18, [x16, %[input_col_stride1]]\n"
585 "str s1, [x22, %[output_col_stride1]]\n"
586 "mov v17.16b, v27.16b\n"
587 "fmla v0.4s, v15.4s, v11.4s\n"
588 "ldr s9, [%[wbptr], #28]\n"
589 "str s3, [x21, x23]\n"
590 "mov v16.16b, v27.16b\n"
591 "mov v15.16b, v27.16b\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000592 "add x9, x9, #4\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000593 "fmla v0.4s, v21.4s, v13.4s\n"
594 "ldr s11, [%[wbptr], #20]\n"
595 "mov v2.16b, v27.16b\n"
596 "add x19, x19, #4\n"
597 "mov v4.16b, v27.16b\n"
598 "add x20, x20, #4\n"
599 "fmla v0.4s, v23.4s, v8.4s\n"
600 "ldr s13, [%[wbptr], #12]\n"
601 "mov v5.16b, v27.16b\n"
602 "add %[outptr0], %[outptr0], #4\n"
603 "mov v1.16b, v27.16b\n"
604 "add x21, x21, #4\n"
605 "fmla v0.4s, v24.4s, v10.4s\n"
606 "ldr s8, [%[wbptr], #32]\n"
607 "mov v3.16b, v27.16b\n"
608 "fmla v0.4s, v25.4s, v7.4s\n"
609 "ldr s10, [%[wbptr], #24]\n"
610 "str s0, [x22, x23]\n"
611 "mov v0.16b, v27.16b\n"
612 "ldr s7, [%[wbptr], #36]\n"
613 "add x22, x22, #4\n"
614 "bne 5b\n"
615 "6:\n"
616 "fmla v17.4s, v29.4s, v6.4s\n"
617 "ldr s21, [x15, x26]\n"
618 "fmla v16.4s, v22.4s, v6.4s\n"
619 "ldr s27, [%[inptr0], x27]\n"
620 "fmla v15.4s, v19.4s, v6.4s\n"
621 "add %[wbptr], %[wbptr], #40\n"
622 "fmla v17.4s, v28.4s, v12.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000623 "ldr s25, [x9]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000624 "fmla v16.4s, v30.4s, v12.4s\n"
625 "ldr s24, [x17, %[input_col_stride1]]\n"
626 "fmla v15.4s, v21.4s, v12.4s\n"
627 "prfm pldl1keep, [%[wbptr], #64]\n"
628 "fmla v17.4s, v26.4s, v14.4s\n"
629 "ldr s23, [x16, x26]\n"
630 "fmla v16.4s, v18.4s, v14.4s\n"
631 "fmla v2.4s, v25.4s, v6.4s\n"
632 "fmla v15.4s, v27.4s, v14.4s\n"
633 "ldr s26, [x15, x27]\n"
634 "fmla v17.4s, v22.4s, v9.4s\n"
635 "ldr s22, [%[inptr0], x28]\n"
636 "fmla v16.4s, v25.4s, v9.4s\n"
637 "ldr s30, [x19]\n"
638 "fmla v15.4s, v23.4s, v9.4s\n"
639 "fmla v4.4s, v23.4s, v6.4s\n"
640 "fmla v17.4s, v20.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000641 "ldr s29, [x9, %[input_col_stride1]]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000642 "fmla v16.4s, v24.4s, v11.4s\n"
643 "ldr s28, [x17, x26]\n"
644 "fmla v15.4s, v26.4s, v11.4s\n"
645 "ldr s24, [x16, x27]\n"
646 "fmla v17.4s, v19.4s, v13.4s\n"
647 "ldr s25, [x15, x28]\n"
648 "fmla v16.4s, v23.4s, v13.4s\n"
649 "fmla v5.4s, v22.4s, v6.4s\n"
650 "fmla v15.4s, v22.4s, v13.4s\n"
651 "ldr s19, [%[inptr0], x13]\n"
652 "fmla v17.4s, v18.4s, v8.4s\n"
653 "ldr s18, [x20]\n"
654 "fmla v2.4s, v30.4s, v12.4s\n"
655 "ldr s22, [x19, %[input_col_stride1]]\n"
656 "fmla v16.4s, v29.4s, v8.4s\n"
657 "fmla v4.4s, v28.4s, v12.4s\n"
658 "fmla v17.4s, v21.4s, v10.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000659 "ldr s26, [x9, x26]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000660 "fmla v2.4s, v29.4s, v14.4s\n"
661 "ldr s20, [x17, x27]\n"
662 "fmla v16.4s, v28.4s, v10.4s\n"
663 "ldr s27, [x16, x28]\n"
664 "fmla v17.4s, v23.4s, v7.4s\n"
665 "ldr s30, [x15, x13]\n"
666 "fmla v15.4s, v24.4s, v8.4s\n"
667 "fmla v4.4s, v24.4s, v14.4s\n"
668 "fmla v5.4s, v25.4s, v12.4s\n"
669 "ldr s24, [%[inptr0], x14]\n"
670 "str s17, [%[outptr0]]\n"
671 "fmla v2.4s, v18.4s, v9.4s\n"
672 "fmla v15.4s, v25.4s, v10.4s\n"
673 "ldr s28, [x20, %[input_col_stride1]]\n"
674 "fmla v5.4s, v19.4s, v14.4s\n"
675 "ldr s17, [x19, x26]\n"
676 "fmla v2.4s, v22.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000677 "ldr s18, [x9, x27]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000678 "fmla v16.4s, v26.4s, v7.4s\n"
679 "ldr s25, [x17, x28]\n"
680 "fmla v4.4s, v26.4s, v9.4s\n"
681 "ldr s22, [x16, x13]\n"
682 "fmla v2.4s, v26.4s, v13.4s\n"
683 "add %[inptr0], %[inptr0], #4\n"
684 "str s16, [x21]\n"
685 "fmla v1.4s, v26.4s, v6.4s\n"
686 "fmla v4.4s, v20.4s, v11.4s\n"
687 "ldr s21, [x15, x14]\n"
688 "fmla v15.4s, v27.4s, v7.4s\n"
689 "ldr s23, [x20, x26]\n"
690 "fmla v5.4s, v27.4s, v9.4s\n"
691 "ldr s19, [x19, x27]\n"
692 "fmla v4.4s, v27.4s, v13.4s\n"
693 "add x15, x15, #4\n"
694 "str s15, [%[outptr0], %[output_col_stride1]]\n"
695 "fmla v3.4s, v27.4s, v6.4s\n"
696 "fmla v5.4s, v30.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000697 "ldr s29, [x9, x28]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000698 "fmla v2.4s, v28.4s, v8.4s\n"
699 "ldr s27, [x17, x13]\n"
700 "fmla v1.4s, v17.4s, v12.4s\n"
701 "ldr s28, [x16, x14]\n"
702 "fmla v5.4s, v24.4s, v13.4s\n"
703 "ldr s26, [x20, x27]\n"
704 "fmla v2.4s, v17.4s, v10.4s\n"
705 "ldr s20, [x19, x28]\n"
706 "fmla v4.4s, v18.4s, v8.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000707 "ldr s17, [x9, x13]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000708 "fmla v1.4s, v18.4s, v14.4s\n"
709 "ldr s18, [x17, x14]\n"
710 "fmla v3.4s, v25.4s, v12.4s\n"
711 "add x16, x16, #4\n"
712 "fmla v4.4s, v25.4s, v10.4s\n"
713 "ldr s16, [x20, x28]\n"
714 "fmla v5.4s, v22.4s, v8.4s\n"
715 "add x17, x17, #4\n"
716 "fmla v3.4s, v22.4s, v14.4s\n"
717 "ldr s15, [x19, x13]\n"
718 "fmla v2.4s, v23.4s, v7.4s\n"
719 "fmla v1.4s, v23.4s, v9.4s\n"
720 "fmla v5.4s, v21.4s, v10.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000721 "ldr s21, [x9, x14]\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000722 "fmla v4.4s, v29.4s, v7.4s\n"
723 "ldr s23, [x20, x13]\n"
724 "str s2, [x22]\n"
725 "fmla v1.4s, v19.4s, v11.4s\n"
726 "fmla v3.4s, v29.4s, v9.4s\n"
727 "ldr s24, [x19, x14]\n"
728 "str s4, [x21, %[output_col_stride1]]\n"
729 "fmla v0.4s, v29.4s, v6.4s\n"
730 "fmla v1.4s, v29.4s, v13.4s\n"
731 "ldr s25, [x20, x14]\n"
732 "fmla v3.4s, v27.4s, v11.4s\n"
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000733 "add x9, x9, #4\n"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000734 "fmla v5.4s, v28.4s, v7.4s\n"
735 "add x19, x19, #4\n"
736 "fmla v1.4s, v26.4s, v8.4s\n"
737 "add x20, x20, #4\n"
738 "fmla v3.4s, v28.4s, v13.4s\n"
739 "fmla v0.4s, v20.4s, v12.4s\n"
740 "str s5, [%[outptr0], x23]\n"
741 "fmla v1.4s, v20.4s, v10.4s\n"
742 "fmla v3.4s, v17.4s, v8.4s\n"
743 "add %[outptr0], %[outptr0], #4\n"
744 "fmla v0.4s, v17.4s, v14.4s\n"
745 "fmla v1.4s, v16.4s, v7.4s\n"
746 "fmla v3.4s, v18.4s, v10.4s\n"
747 "fmla v0.4s, v16.4s, v9.4s\n"
748 "str s1, [x22, %[output_col_stride1]]\n"
749 "fmla v3.4s, v21.4s, v7.4s\n"
750 "fmla v0.4s, v15.4s, v11.4s\n"
751 "str s3, [x21, x23]\n"
752 "fmla v0.4s, v21.4s, v13.4s\n"
753 "add x21, x21, #4\n"
754 "fmla v0.4s, v23.4s, v8.4s\n"
755 "fmla v0.4s, v24.4s, v10.4s\n"
756 "fmla v0.4s, v25.4s, v7.4s\n"
757 "str s0, [x22, x23]\n"
758 "add x22, x22, #4\n"
759 "7:\n"
760 : [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output)
761 : [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
Georgios Pinitasce3a7b22020-03-10 15:33:57 +0000762 : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory"
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000763 );
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000764}
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000765#endif // __aarch64__
766
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000767template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000768
Georgios Pinitas4074c992018-01-30 18:13:46 +0000769} // namespace depthwise