blob: dc77d0c45025f2eafbc18c2bf629a439f5b744d0 [file] [log] [blame]
/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
Matthew Benthamf1aeab92023-05-30 13:35:34 +000025#include "arm_compute/core/ActivationLayerInfo.h"
Gunes Bayirae72a462023-01-29 13:24:24 +000026#include "arm_compute/core/Helpers.h"
27#include "arm_compute/core/ITensor.h"
28#include "arm_compute/core/QuantizationInfo.h"
29#include "arm_compute/core/Types.h"
30#include "arm_compute/core/Window.h"
31
32#include <cstddef>
33#include <cstdint>
34#include <limits>
35
36#ifdef __aarch64__
37namespace
38{
39void a64_add_bn_clamp_direct_u8_fp32_2x16(
40 uint8_t *out, size_t out_stride,
41 uint8_t *out_direct, size_t out_direct_stride,
42 const uint8_t *in0, size_t in0_stride,
43 const uint8_t *in1, size_t in1_stride,
44 const float *bn_mul,
45 const float *bn_add,
46 const uint8_t minval,
47 const uint8_t maxval,
48 int32_t out_zeropt, float out_scale,
49 int32_t out_direct_zeropt, float out_direct_scale,
50 int32_t in0_zeropt, float in0_scale,
51 int32_t in1_zeropt, float in1_scale,
52 size_t width, size_t height)
53{
54 float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
55 struct KernelArgs
56 {
57 const float *scales;
58 int32_t in0_zeropt;
59 int32_t in1_zeropt;
60 int32_t out_zeropt;
61 int32_t out_direct_zeropt;
62 int32_t minval;
63 int32_t maxval;
64 } ka;
65 ka.scales = scales;
66 ka.in0_zeropt = in0_zeropt;
67 ka.in1_zeropt = in1_zeropt;
68 ka.out_zeropt = out_zeropt;
69 ka.out_direct_zeropt = out_direct_zeropt;
70 ka.minval = minval;
71 ka.maxval = maxval;
72
73 __asm__ __volatile__(
74 "ldr x20, [%x[args_ptr], %[offsetof_scales]]\n"
75 "ld1 { v0.4s }, [x20]\n"
76 "cmp %x[width], #0x10\n"
77 "blt 5f\n"
78 "1:" // Column loop
79 "ldr q24, [%x[bn_mul], #0x0]\n"
80 "ldr q25, [%x[bn_mul], #0x10]\n"
81 "mov x23, %x[height]\n"
82 "mov x12, %x[in0]\n"
83 "ldr q26, [%x[bn_mul], #0x20]\n"
84 "ldr q27, [%x[bn_mul], #0x30]\n"
85 "mov x11, %x[in1]\n"
86 "mov x10, %x[out]\n"
87 "ldr q28, [%x[bn_add], #0x0]\n"
88 "ldr q29, [%x[bn_add], #0x10]\n"
89 "mov x9, %x[out_direct]\n"
90 "add %x[bn_mul], %x[bn_mul], #0x40\n"
91 "ldr q30, [%x[bn_add], #0x20]\n"
92 "ldr q31, [%x[bn_add], #0x30]\n"
93 "add %x[bn_add], %x[bn_add], #0x40\n"
94 "2:" // Row loop
95 "mov x28, x12\n"
96 "ldr d4, [x28, #0x0]\n"
97 "ldr d3, [x28, #0x8]\n"
98 "add x21, x28, %x[in0_stride]\n"
99 "mov x27, x11\n"
100 "ldr d13, [x27, #0x0]\n"
101 "ldr d12, [x27, #0x8]\n"
102 "cmp x23, #0x2\n"
103 "add x12, x21, %x[in0_stride]\n"
104 "csel x21, x21, x28, GE\n"
105 "ldr d2, [x21, #0x0]\n"
106 "ldr d11, [x21, #0x8]\n"
107 "add x20, x27, %x[in1_stride]\n"
108 "add x11, x20, %x[in1_stride]\n"
109 "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
110 "ushll v4.8h, v4.8b, #0x0\n"
111 "csel x20, x20, x27, GE\n"
112 "ldr d10, [x20, #0x0]\n"
113 "ldr d9, [x20, #0x8]\n"
114 "ushll v3.8h, v3.8b, #0x0\n"
115 "ushll v2.8h, v2.8b, #0x0\n"
116 "ushll v11.8h, v11.8b, #0x0\n"
117 "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
118 "mov x26, x10\n"
119 "dup v16.8h, w21\n"
120 "ushll v13.8h, v13.8b, #0x0\n"
121 "mov x25, x9\n"
122 "add x24, x26, %x[out_stride]\n"
123 "ushll v12.8h, v12.8b, #0x0\n"
124 "ushll v10.8h, v10.8b, #0x0\n"
125 "add x22, x25, %x[out_direct_stride]\n"
126 "add x10, x24, %x[out_stride]\n"
127 "ushll v9.8h, v9.8b, #0x0\n"
128 "ssubl v1.4s, v4.4h, v16.4h\n"
129 "add x9, x22, %x[out_direct_stride]\n"
130 "csel x24, x24, x26, GE\n"
131 "ssubl2 v4.4s, v4.8h, v16.8h\n"
132 "ssubl v23.4s, v3.4h, v16.4h\n"
133 "csel x22, x22, x25, GE\n"
134 "ssubl2 v3.4s, v3.8h, v16.8h\n"
135 "ssubl v22.4s, v2.4h, v16.4h\n"
136 "ssubl2 v2.4s, v2.8h, v16.8h\n"
137 "ssubl v21.4s, v11.4h, v16.4h\n"
138 "ssubl2 v11.4s, v11.8h, v16.8h\n"
139 "dup v20.8h, w20\n"
140 "ssubl v19.4s, v13.4h, v20.4h\n"
141 "ssubl2 v13.4s, v13.8h, v20.8h\n"
142 "ssubl v18.4s, v12.4h, v20.4h\n"
143 "ssubl2 v12.4s, v12.8h, v20.8h\n"
144 "ssubl v17.4s, v10.4h, v20.4h\n"
145 "ssubl2 v10.4s, v10.8h, v20.8h\n"
146 "ssubl v16.4s, v9.4h, v20.4h\n"
147 "ssubl2 v9.4s, v9.8h, v20.8h\n"
148 "scvtf v8.4s, v1.4s\n"
149 "scvtf v7.4s, v4.4s\n"
150 "scvtf v6.4s, v23.4s\n"
151 "scvtf v5.4s, v3.4s\n"
152 "scvtf v4.4s, v22.4s\n"
153 "scvtf v3.4s, v2.4s\n"
154 "scvtf v2.4s, v21.4s\n"
155 "scvtf v1.4s, v11.4s\n"
156 "scvtf v19.4s, v19.4s\n"
157 "fmul v8.4s, v8.4s, v0.s[0]\n"
158 "fmla v8.4s, v19.4s, v0.s[1]\n"
159 "scvtf v13.4s, v13.4s\n"
160 "fmul v7.4s, v7.4s, v0.s[0]\n"
161 "fmla v7.4s, v13.4s, v0.s[1]\n"
162 "scvtf v18.4s, v18.4s\n"
163 "fmul v6.4s, v6.4s, v0.s[0]\n"
164 "fmla v6.4s, v18.4s, v0.s[1]\n"
165 "scvtf v12.4s, v12.4s\n"
166 "fmul v5.4s, v5.4s, v0.s[0]\n"
167 "fmla v5.4s, v12.4s, v0.s[1]\n"
168 "scvtf v17.4s, v17.4s\n"
169 "fmul v4.4s, v4.4s, v0.s[0]\n"
170 "fmla v4.4s, v17.4s, v0.s[1]\n"
171 "scvtf v10.4s, v10.4s\n"
172 "fmul v3.4s, v3.4s, v0.s[0]\n"
173 "fmla v3.4s, v10.4s, v0.s[1]\n"
174 "scvtf v16.4s, v16.4s\n"
175 "fmul v2.4s, v2.4s, v0.s[0]\n"
176 "fmla v2.4s, v16.4s, v0.s[1]\n"
177 "scvtf v9.4s, v9.4s\n"
178 "fmul v1.4s, v1.4s, v0.s[0]\n"
179 "fmla v1.4s, v9.4s, v0.s[1]\n"
180 "cbz %x[out_direct], 3f\n"
181 "fmul v23.4s, v8.4s, v0.s[3]\n"
182 "fmul v22.4s, v7.4s, v0.s[3]\n"
183 "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
184 "fmul v21.4s, v6.4s, v0.s[3]\n"
185 "fmul v20.4s, v5.4s, v0.s[3]\n"
186 "fmul v19.4s, v4.4s, v0.s[3]\n"
187 "fmul v18.4s, v3.4s, v0.s[3]\n"
188 "fmul v16.4s, v2.4s, v0.s[3]\n"
189 "fmul v17.4s, v1.4s, v0.s[3]\n"
190 "fcvtas v23.4s, v23.4s\n"
191 "fcvtas v22.4s, v22.4s\n"
192 "fcvtas v21.4s, v21.4s\n"
193 "fcvtas v20.4s, v20.4s\n"
194 "fcvtas v19.4s, v19.4s\n"
195 "fcvtas v18.4s, v18.4s\n"
196 "fcvtas v16.4s, v16.4s\n"
197 "fcvtas v17.4s, v17.4s\n"
198 "uzp1 v22.8h, v23.8h, v22.8h\n"
199 "uzp1 v20.8h, v21.8h, v20.8h\n"
200 "uzp1 v18.8h, v19.8h, v18.8h\n"
201 "uzp1 v17.8h, v16.8h, v17.8h\n"
202 "dup v16.8h, w20\n"
203 "add v22.8h, v22.8h, v16.8h\n"
204 "add v20.8h, v20.8h, v16.8h\n"
205 "add v18.8h, v18.8h, v16.8h\n"
206 "add v17.8h, v17.8h, v16.8h\n"
207 "movi v16.8h, #0xff\n"
208 "smin v22.8h, v22.8h, v16.8h\n"
209 "smin v20.8h, v20.8h, v16.8h\n"
210 "smin v18.8h, v18.8h, v16.8h\n"
211 "smin v17.8h, v17.8h, v16.8h\n"
212 "movi v16.8h, #0x0\n"
213 "smax v22.8h, v22.8h, v16.8h\n"
214 "smax v20.8h, v20.8h, v16.8h\n"
215 "smax v18.8h, v18.8h, v16.8h\n"
216 "smax v17.8h, v17.8h, v16.8h\n"
217 "xtn v22.8b, v22.8h\n"
218 "str d22, [x25, #0x0]\n"
219 "xtn v20.8b, v20.8h\n"
220 "xtn v18.8b, v18.8h\n"
221 "str d20, [x25, #0x8]\n"
222 "xtn v17.8b, v17.8h\n"
223 "str d18, [x22, #0x0]\n"
224 "str d17, [x22, #0x8]\n"
225 "3:" // Main loop: No direct output
226 "mov v19.16b, v28.16b\n"
227 "mov v13.16b, v29.16b\n"
228 "fmla v19.4s, v8.4s, v24.4s\n"
229 "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
230 "mov v18.16b, v30.16b\n"
231 "mov v12.16b, v31.16b\n"
232 "fmla v13.4s, v7.4s, v25.4s\n"
233 "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
234 "mov v17.16b, v28.16b\n"
235 "mov v10.16b, v29.16b\n"
236 "fmla v18.4s, v6.4s, v26.4s\n"
237 "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
238 "mov v16.16b, v30.16b\n"
239 "mov v9.16b, v31.16b\n"
240 "fmla v12.4s, v5.4s, v27.4s\n"
241 "subs x23, x23, #0x2\n"
242 "fmla v17.4s, v4.4s, v24.4s\n"
243 "fmla v10.4s, v3.4s, v25.4s\n"
244 "fmul v8.4s, v19.4s, v0.s[2]\n"
245 "fmla v16.4s, v2.4s, v26.4s\n"
246 "fmla v9.4s, v1.4s, v27.4s\n"
247 "fmul v7.4s, v13.4s, v0.s[2]\n"
248 "fmul v6.4s, v18.4s, v0.s[2]\n"
249 "fmul v5.4s, v12.4s, v0.s[2]\n"
250 "fmul v4.4s, v17.4s, v0.s[2]\n"
251 "fmul v3.4s, v10.4s, v0.s[2]\n"
252 "fmul v2.4s, v16.4s, v0.s[2]\n"
253 "fmul v1.4s, v9.4s, v0.s[2]\n"
254 "fcvtas v8.4s, v8.4s\n"
255 "fcvtas v7.4s, v7.4s\n"
256 "fcvtas v6.4s, v6.4s\n"
257 "fcvtas v5.4s, v5.4s\n"
258 "fcvtas v4.4s, v4.4s\n"
259 "fcvtas v3.4s, v3.4s\n"
260 "fcvtas v2.4s, v2.4s\n"
261 "fcvtas v1.4s, v1.4s\n"
262 "uzp1 v7.8h, v8.8h, v7.8h\n"
263 "uzp1 v5.8h, v6.8h, v5.8h\n"
264 "uzp1 v3.8h, v4.8h, v3.8h\n"
265 "uzp1 v1.8h, v2.8h, v1.8h\n"
266 "dup v16.8h, w22\n"
267 "add v7.8h, v7.8h, v16.8h\n"
268 "add v5.8h, v5.8h, v16.8h\n"
269 "add v3.8h, v3.8h, v16.8h\n"
270 "add v1.8h, v1.8h, v16.8h\n"
271 "dup v16.8h, w21\n"
272 "smin v7.8h, v7.8h, v16.8h\n"
273 "smin v5.8h, v5.8h, v16.8h\n"
274 "smin v3.8h, v3.8h, v16.8h\n"
275 "smin v1.8h, v1.8h, v16.8h\n"
276 "dup v16.8h, w20\n"
277 "smax v7.8h, v7.8h, v16.8h\n"
278 "smax v5.8h, v5.8h, v16.8h\n"
279 "smax v3.8h, v3.8h, v16.8h\n"
280 "smax v1.8h, v1.8h, v16.8h\n"
281 "xtn v7.8b, v7.8h\n"
282 "str d7, [x26, #0x0]\n"
283 "xtn v5.8b, v5.8h\n"
284 "xtn v3.8b, v3.8h\n"
285 "str d5, [x26, #0x8]\n"
286 "xtn v1.8b, v1.8h\n"
287 "str d3, [x24, #0x0]\n"
288 "str d1, [x24, #0x8]\n"
289 "bgt 2b\n"
290 "add %x[in0], %x[in0], #0x10\n"
291 "add %x[in1], %x[in1], #0x10\n"
292 "add %x[out], %x[out], #0x10\n"
293 "cbz %x[out_direct], 4f\n"
294 "add %x[out_direct], %x[out_direct], #0x10\n"
295 "4:" // No direct pointer update
296 "sub %x[width], %x[width], #0x10\n"
297 "cmp %x[width], #0x10\n"
298 "bge 1b\n"
299 "cbz %x[width], 32f\n"
300 "5:" // main loop skip
301 "ldr q24, [%x[bn_mul], #0x0]\n"
302 "ldr q25, [%x[bn_mul], #0x10]\n"
303 "mov x23, %x[height]\n"
304 "mov x12, %x[in0]\n"
305 "ldr q26, [%x[bn_mul], #0x20]\n"
306 "ldr q27, [%x[bn_mul], #0x30]\n"
307 "mov x11, %x[in1]\n"
308 "mov x10, %x[out]\n"
309 "ldr q28, [%x[bn_add], #0x0]\n"
310 "ldr q29, [%x[bn_add], #0x10]\n"
311 "mov x9, %x[out_direct]\n"
312 "add %x[bn_mul], %x[bn_mul], #0x40\n"
313 "ldr q30, [%x[bn_add], #0x20]\n"
314 "ldr q31, [%x[bn_add], #0x30]\n"
315 "add %x[bn_add], %x[bn_add], #0x40\n"
316 "6:" // tail loop: Row loop
317 "mov x28, x12\n"
318 "mov x27, x11\n"
319 "mov x26, x10\n"
320 "mov x25, x9\n"
321 "add x21, x28, %x[in0_stride]\n"
322 "add x20, x27, %x[in1_stride]\n"
323 "add x24, x26, %x[out_stride]\n"
324 "add x22, x25, %x[out_direct_stride]\n"
325 "cmp x23, #0x2\n"
326 "add x12, x21, %x[in0_stride]\n"
327 "add x11, x20, %x[in1_stride]\n"
328 "add x10, x24, %x[out_stride]\n"
329 "add x9, x22, %x[out_direct_stride]\n"
330 "csel x21, x21, x28, GE\n"
331 "csel x20, x20, x27, GE\n"
332 "csel x24, x24, x26, GE\n"
333 "csel x22, x22, x25, GE\n"
334 "tbz %x[width], #3, 10f\n"
335 "ldr d4, [x28, #0x0]\n"
336 "ldr d13, [x27, #0x0]\n"
337 "add x28, x28, #0x8\n"
338 "add x27, x27, #0x8\n"
339 "ldr d2, [x21, #0x0]\n"
340 "ldr d10, [x20, #0x0]\n"
341 "add x21, x21, #0x8\n"
342 "add x20, x20, #0x8\n"
343 "tbz %x[width], #2, 8f\n"
344 "ldr s3, [x28], #0x4\n"
345 "ldr s12, [x27], #0x4\n"
346 "ldr s11, [x21], #0x4\n"
347 "ldr s9, [x20], #0x4\n"
348 "tbz %x[width], #1, 7f\n"
349 "ld1 { v3.h }[2], [x28], #0x2\n"
350 "ld1 { v12.h }[2], [x27], #0x2\n"
351 "ld1 { v11.h }[2], [x21], #0x2\n"
352 "ld1 { v9.h }[2], [x20], #0x2\n"
353 "tbz %x[width], #0, 14f\n"
354 "ld1 { v3.b }[6], [x28], #0x1\n"
355 "ld1 { v12.b }[6], [x27], #0x1\n"
356 "ld1 { v11.b }[6], [x21], #0x1\n"
357 "ld1 { v9.b }[6], [x20], #0x1\n"
358 "b 14f\n"
359 "7:" // tail loop: unique 1: partial_0_12
360 "tbz %x[width], #0, 14f\n"
361 "ld1 { v3.b }[4], [x28], #0x1\n"
362 "ld1 { v12.b }[4], [x27], #0x1\n"
363 "ld1 { v11.b }[4], [x21], #0x1\n"
364 "ld1 { v9.b }[4], [x20], #0x1\n"
365 "b 14f\n"
366 "8:" // tail loop: unique 1: partial_1_8
367 "tbz %x[width], #1, 9f\n"
368 "ldr h3, [x28], #0x2\n"
369 "ldr h12, [x27], #0x2\n"
370 "ldr h11, [x21], #0x2\n"
371 "ldr h9, [x20], #0x2\n"
372 "tbz %x[width], #0, 14f\n"
373 "ld1 { v3.b }[2], [x28], #0x1\n"
374 "ld1 { v12.b }[2], [x27], #0x1\n"
375 "ld1 { v11.b }[2], [x21], #0x1\n"
376 "ld1 { v9.b }[2], [x20], #0x1\n"
377 "b 14f\n"
378 "9:" // tail loop: unique 1: partial_0_8
379 "tbz %x[width], #0, 14f\n"
380 "ldr b3, [x28], #0x1\n"
381 "ldr b12, [x27], #0x1\n"
382 "ldr b11, [x21], #0x1\n"
383 "ldr b9, [x20], #0x1\n"
384 "b 14f\n"
385 "10:" // tail loop: unique 1: partial_2_0
386 "tbz %x[width], #2, 12f\n"
387 "ldr s4, [x28], #0x4\n"
388 "ldr s13, [x27], #0x4\n"
389 "ldr s2, [x21], #0x4\n"
390 "ldr s10, [x20], #0x4\n"
391 "tbz %x[width], #1, 11f\n"
392 "ld1 { v4.h }[2], [x28], #0x2\n"
393 "ld1 { v13.h }[2], [x27], #0x2\n"
394 "ld1 { v2.h }[2], [x21], #0x2\n"
395 "ld1 { v10.h }[2], [x20], #0x2\n"
396 "tbz %x[width], #0, 14f\n"
397 "ld1 { v4.b }[6], [x28], #0x1\n"
398 "ld1 { v13.b }[6], [x27], #0x1\n"
399 "ld1 { v2.b }[6], [x21], #0x1\n"
400 "ld1 { v10.b }[6], [x20], #0x1\n"
401 "b 14f\n"
402 "11:" // tail loop: unique 1: partial_0_4
403 "tbz %x[width], #0, 14f\n"
404 "ld1 { v4.b }[4], [x28], #0x1\n"
405 "ld1 { v13.b }[4], [x27], #0x1\n"
406 "ld1 { v2.b }[4], [x21], #0x1\n"
407 "ld1 { v10.b }[4], [x20], #0x1\n"
408 "b 14f\n"
409 "12:" // tail loop: unique 1: partial_1_0
410 "tbz %x[width], #1, 13f\n"
411 "ldr h4, [x28], #0x2\n"
412 "ldr h13, [x27], #0x2\n"
413 "ldr h2, [x21], #0x2\n"
414 "ldr h10, [x20], #0x2\n"
415 "tbz %x[width], #0, 14f\n"
416 "ld1 { v4.b }[2], [x28], #0x1\n"
417 "ld1 { v13.b }[2], [x27], #0x1\n"
418 "ld1 { v2.b }[2], [x21], #0x1\n"
419 "ld1 { v10.b }[2], [x20], #0x1\n"
420 "b 14f\n"
421 "13:" // tail loop: unique 1: partial_0_0
422 "ldr b4, [x28], #0x1\n"
423 "ldr b13, [x27], #0x1\n"
424 "ldr b2, [x21], #0x1\n"
425 "ldr b10, [x20], #0x1\n"
426 "14:" // tail loop: unique 1: Done
427 "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
428 "ushll v4.8h, v4.8b, #0x0\n"
429 "ushll v3.8h, v3.8b, #0x0\n"
430 "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
431 "ushll v2.8h, v2.8b, #0x0\n"
432 "ushll v11.8h, v11.8b, #0x0\n"
433 "dup v16.8h, w21\n"
434 "ushll v13.8h, v13.8b, #0x0\n"
435 "ushll v12.8h, v12.8b, #0x0\n"
436 "ushll v10.8h, v10.8b, #0x0\n"
437 "ushll v9.8h, v9.8b, #0x0\n"
438 "ssubl v1.4s, v4.4h, v16.4h\n"
439 "ssubl2 v4.4s, v4.8h, v16.8h\n"
440 "ssubl v23.4s, v3.4h, v16.4h\n"
441 "ssubl2 v3.4s, v3.8h, v16.8h\n"
442 "ssubl v22.4s, v2.4h, v16.4h\n"
443 "ssubl2 v2.4s, v2.8h, v16.8h\n"
444 "ssubl v21.4s, v11.4h, v16.4h\n"
445 "ssubl2 v11.4s, v11.8h, v16.8h\n"
446 "dup v20.8h, w20\n"
447 "ssubl v19.4s, v13.4h, v20.4h\n"
448 "ssubl2 v13.4s, v13.8h, v20.8h\n"
449 "ssubl v18.4s, v12.4h, v20.4h\n"
450 "ssubl2 v12.4s, v12.8h, v20.8h\n"
451 "ssubl v17.4s, v10.4h, v20.4h\n"
452 "ssubl2 v10.4s, v10.8h, v20.8h\n"
453 "ssubl v16.4s, v9.4h, v20.4h\n"
454 "ssubl2 v9.4s, v9.8h, v20.8h\n"
455 "scvtf v8.4s, v1.4s\n"
456 "scvtf v7.4s, v4.4s\n"
457 "scvtf v6.4s, v23.4s\n"
458 "scvtf v5.4s, v3.4s\n"
459 "scvtf v4.4s, v22.4s\n"
460 "scvtf v3.4s, v2.4s\n"
461 "scvtf v2.4s, v21.4s\n"
462 "scvtf v1.4s, v11.4s\n"
463 "scvtf v19.4s, v19.4s\n"
464 "fmul v8.4s, v8.4s, v0.s[0]\n"
465 "fmla v8.4s, v19.4s, v0.s[1]\n"
466 "scvtf v13.4s, v13.4s\n"
467 "fmul v7.4s, v7.4s, v0.s[0]\n"
468 "fmla v7.4s, v13.4s, v0.s[1]\n"
469 "scvtf v18.4s, v18.4s\n"
470 "fmul v6.4s, v6.4s, v0.s[0]\n"
471 "fmla v6.4s, v18.4s, v0.s[1]\n"
472 "scvtf v12.4s, v12.4s\n"
473 "fmul v5.4s, v5.4s, v0.s[0]\n"
474 "fmla v5.4s, v12.4s, v0.s[1]\n"
475 "scvtf v17.4s, v17.4s\n"
476 "fmul v4.4s, v4.4s, v0.s[0]\n"
477 "fmla v4.4s, v17.4s, v0.s[1]\n"
478 "scvtf v10.4s, v10.4s\n"
479 "fmul v3.4s, v3.4s, v0.s[0]\n"
480 "fmla v3.4s, v10.4s, v0.s[1]\n"
481 "scvtf v16.4s, v16.4s\n"
482 "fmul v2.4s, v2.4s, v0.s[0]\n"
483 "fmla v2.4s, v16.4s, v0.s[1]\n"
484 "scvtf v9.4s, v9.4s\n"
485 "fmul v1.4s, v1.4s, v0.s[0]\n"
486 "fmla v1.4s, v9.4s, v0.s[1]\n"
487 "cbz %x[out_direct], 23f\n"
488 "fmul v23.4s, v8.4s, v0.s[3]\n"
489 "fmul v22.4s, v7.4s, v0.s[3]\n"
490 "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
491 "fmul v21.4s, v6.4s, v0.s[3]\n"
492 "fmul v20.4s, v5.4s, v0.s[3]\n"
493 "fmul v19.4s, v4.4s, v0.s[3]\n"
494 "fmul v18.4s, v3.4s, v0.s[3]\n"
495 "fmul v16.4s, v2.4s, v0.s[3]\n"
496 "fmul v17.4s, v1.4s, v0.s[3]\n"
497 "fcvtas v23.4s, v23.4s\n"
498 "fcvtas v22.4s, v22.4s\n"
499 "fcvtas v21.4s, v21.4s\n"
500 "fcvtas v20.4s, v20.4s\n"
501 "fcvtas v19.4s, v19.4s\n"
502 "fcvtas v18.4s, v18.4s\n"
503 "fcvtas v16.4s, v16.4s\n"
504 "fcvtas v17.4s, v17.4s\n"
505 "uzp1 v22.8h, v23.8h, v22.8h\n"
506 "uzp1 v20.8h, v21.8h, v20.8h\n"
507 "uzp1 v18.8h, v19.8h, v18.8h\n"
508 "uzp1 v17.8h, v16.8h, v17.8h\n"
509 "dup v16.8h, w20\n"
510 "add v22.8h, v22.8h, v16.8h\n"
511 "add v20.8h, v20.8h, v16.8h\n"
512 "add v18.8h, v18.8h, v16.8h\n"
513 "add v17.8h, v17.8h, v16.8h\n"
514 "movi v16.8h, #0xff\n"
515 "smin v22.8h, v22.8h, v16.8h\n"
516 "smin v20.8h, v20.8h, v16.8h\n"
517 "smin v18.8h, v18.8h, v16.8h\n"
518 "smin v17.8h, v17.8h, v16.8h\n"
519 "movi v16.8h, #0x0\n"
520 "smax v22.8h, v22.8h, v16.8h\n"
521 "smax v20.8h, v20.8h, v16.8h\n"
522 "smax v18.8h, v18.8h, v16.8h\n"
523 "smax v17.8h, v17.8h, v16.8h\n"
524 "xtn v22.8b, v22.8h\n"
525 "xtn v20.8b, v20.8h\n"
526 "xtn v18.8b, v18.8h\n"
527 "xtn v17.8b, v17.8h\n"
528 "tbz %x[width], #3, 18f\n"
529 "str d22, [x25, #0x0]\n"
530 "add x25, x25, #0x8\n"
531 "str d18, [x22, #0x0]\n"
532 "add x22, x22, #0x8\n"
533 "tbz %x[width], #2, 16f\n"
534 "str s20, [x25], #0x4\n"
535 "str s17, [x22], #0x4\n"
536 "tbz %x[width], #1, 15f\n"
537 "st1 { v20.h }[2], [x25], #0x2\n"
538 "st1 { v17.h }[2], [x22], #0x2\n"
539 "tbz %x[width], #0, 22f\n"
540 "st1 { v20.b }[6], [x25], #0x1\n"
541 "st1 { v17.b }[6], [x22], #0x1\n"
542 "b 22f\n"
543 "15:" // tail loop: Main loop: unique 2: partial_0_12
544 "tbz %x[width], #0, 22f\n"
545 "st1 { v20.b }[4], [x25], #0x1\n"
546 "st1 { v17.b }[4], [x22], #0x1\n"
547 "b 22f\n"
548 "16:" // tail loop: Main loop: unique 2: partial_1_8
549 "tbz %x[width], #1, 17f\n"
550 "str h20, [x25], #0x2\n"
551 "str h17, [x22], #0x2\n"
552 "tbz %x[width], #0, 22f\n"
553 "st1 { v20.b }[2], [x25], #0x1\n"
554 "st1 { v17.b }[2], [x22], #0x1\n"
555 "b 22f\n"
556 "17:" // tail loop: Main loop: unique 2: partial_0_8
557 "tbz %x[width], #0, 22f\n"
558 "str b20, [x25], #0x1\n"
559 "str b17, [x22], #0x1\n"
560 "b 22f\n"
561 "18:" // tail loop: Main loop: unique 2: partial_2_0
562 "tbz %x[width], #2, 20f\n"
563 "str s22, [x25], #0x4\n"
564 "str s18, [x22], #0x4\n"
565 "tbz %x[width], #1, 19f\n"
566 "st1 { v22.h }[2], [x25], #0x2\n"
567 "st1 { v18.h }[2], [x22], #0x2\n"
568 "tbz %x[width], #0, 22f\n"
569 "st1 { v22.b }[6], [x25], #0x1\n"
570 "st1 { v18.b }[6], [x22], #0x1\n"
571 "b 22f\n"
572 "19:" // tail loop: Main loop: unique 2: partial_0_4
573 "tbz %x[width], #0, 22f\n"
574 "st1 { v22.b }[4], [x25], #0x1\n"
575 "st1 { v18.b }[4], [x22], #0x1\n"
576 "b 22f\n"
577 "20:" // tail loop: Main loop: unique 2: partial_1_0
578 "tbz %x[width], #1, 21f\n"
579 "str h22, [x25], #0x2\n"
580 "str h18, [x22], #0x2\n"
581 "tbz %x[width], #0, 22f\n"
582 "st1 { v22.b }[2], [x25], #0x1\n"
583 "st1 { v18.b }[2], [x22], #0x1\n"
584 "b 22f\n"
585 "21:" // tail loop: Main loop: unique 2: partial_0_0
586 "str b22, [x25], #0x1\n"
587 "str b18, [x22], #0x1\n"
588 "22:" // tail loop: Main loop: unique 2: Done
589 "23:" // tail loop: Main loop: No direct output
590 "mov v19.16b, v28.16b\n"
591 "mov v13.16b, v29.16b\n"
592 "fmla v19.4s, v8.4s, v24.4s\n"
593 "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
594 "mov v18.16b, v30.16b\n"
595 "mov v12.16b, v31.16b\n"
596 "fmla v13.4s, v7.4s, v25.4s\n"
597 "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
598 "mov v17.16b, v28.16b\n"
599 "mov v10.16b, v29.16b\n"
600 "fmla v18.4s, v6.4s, v26.4s\n"
601 "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
602 "mov v16.16b, v30.16b\n"
603 "mov v9.16b, v31.16b\n"
604 "fmla v12.4s, v5.4s, v27.4s\n"
605 "fmla v17.4s, v4.4s, v24.4s\n"
606 "fmla v10.4s, v3.4s, v25.4s\n"
607 "fmul v8.4s, v19.4s, v0.s[2]\n"
608 "fmla v16.4s, v2.4s, v26.4s\n"
609 "fmla v9.4s, v1.4s, v27.4s\n"
610 "fmul v7.4s, v13.4s, v0.s[2]\n"
611 "fmul v6.4s, v18.4s, v0.s[2]\n"
612 "fmul v5.4s, v12.4s, v0.s[2]\n"
613 "fmul v4.4s, v17.4s, v0.s[2]\n"
614 "fmul v3.4s, v10.4s, v0.s[2]\n"
615 "fmul v2.4s, v16.4s, v0.s[2]\n"
616 "fmul v1.4s, v9.4s, v0.s[2]\n"
617 "fcvtas v8.4s, v8.4s\n"
618 "fcvtas v7.4s, v7.4s\n"
619 "fcvtas v6.4s, v6.4s\n"
620 "fcvtas v5.4s, v5.4s\n"
621 "fcvtas v4.4s, v4.4s\n"
622 "fcvtas v3.4s, v3.4s\n"
623 "fcvtas v2.4s, v2.4s\n"
624 "fcvtas v1.4s, v1.4s\n"
625 "uzp1 v7.8h, v8.8h, v7.8h\n"
626 "uzp1 v5.8h, v6.8h, v5.8h\n"
627 "uzp1 v3.8h, v4.8h, v3.8h\n"
628 "uzp1 v1.8h, v2.8h, v1.8h\n"
629 "dup v16.8h, w22\n"
630 "add v7.8h, v7.8h, v16.8h\n"
631 "add v5.8h, v5.8h, v16.8h\n"
632 "add v3.8h, v3.8h, v16.8h\n"
633 "add v1.8h, v1.8h, v16.8h\n"
634 "dup v16.8h, w21\n"
635 "smin v7.8h, v7.8h, v16.8h\n"
636 "smin v5.8h, v5.8h, v16.8h\n"
637 "smin v3.8h, v3.8h, v16.8h\n"
638 "smin v1.8h, v1.8h, v16.8h\n"
639 "dup v16.8h, w20\n"
640 "smax v7.8h, v7.8h, v16.8h\n"
641 "smax v5.8h, v5.8h, v16.8h\n"
642 "smax v3.8h, v3.8h, v16.8h\n"
643 "smax v1.8h, v1.8h, v16.8h\n"
644 "xtn v7.8b, v7.8h\n"
645 "xtn v5.8b, v5.8h\n"
646 "xtn v3.8b, v3.8h\n"
647 "xtn v1.8b, v1.8h\n"
648 "tbz %x[width], #3, 27f\n"
649 "str d7, [x26, #0x0]\n"
650 "add x26, x26, #0x8\n"
651 "str d3, [x24, #0x0]\n"
652 "add x24, x24, #0x8\n"
653 "tbz %x[width], #2, 25f\n"
654 "str s5, [x26], #0x4\n"
655 "str s1, [x24], #0x4\n"
656 "tbz %x[width], #1, 24f\n"
657 "st1 { v5.h }[2], [x26], #0x2\n"
658 "st1 { v1.h }[2], [x24], #0x2\n"
659 "tbz %x[width], #0, 31f\n"
660 "st1 { v5.b }[6], [x26], #0x1\n"
661 "st1 { v1.b }[6], [x24], #0x1\n"
662 "b 31f\n"
663 "24:" // tail loop: unique 3: partial_0_12
664 "tbz %x[width], #0, 31f\n"
665 "st1 { v5.b }[4], [x26], #0x1\n"
666 "st1 { v1.b }[4], [x24], #0x1\n"
667 "b 31f\n"
668 "25:" // tail loop: unique 3: partial_1_8
669 "tbz %x[width], #1, 26f\n"
670 "str h5, [x26], #0x2\n"
671 "str h1, [x24], #0x2\n"
672 "tbz %x[width], #0, 31f\n"
673 "st1 { v5.b }[2], [x26], #0x1\n"
674 "st1 { v1.b }[2], [x24], #0x1\n"
675 "b 31f\n"
676 "26:" // tail loop: unique 3: partial_0_8
677 "tbz %x[width], #0, 31f\n"
678 "str b5, [x26], #0x1\n"
679 "str b1, [x24], #0x1\n"
680 "b 31f\n"
681 "27:" // tail loop: unique 3: partial_2_0
682 "tbz %x[width], #2, 29f\n"
683 "str s7, [x26], #0x4\n"
684 "str s3, [x24], #0x4\n"
685 "tbz %x[width], #1, 28f\n"
686 "st1 { v7.h }[2], [x26], #0x2\n"
687 "st1 { v3.h }[2], [x24], #0x2\n"
688 "tbz %x[width], #0, 31f\n"
689 "st1 { v7.b }[6], [x26], #0x1\n"
690 "st1 { v3.b }[6], [x24], #0x1\n"
691 "b 31f\n"
692 "28:" // tail loop: unique 3: partial_0_4
693 "tbz %x[width], #0, 31f\n"
694 "st1 { v7.b }[4], [x26], #0x1\n"
695 "st1 { v3.b }[4], [x24], #0x1\n"
696 "b 31f\n"
697 "29:" // tail loop: unique 3: partial_1_0
698 "tbz %x[width], #1, 30f\n"
699 "str h7, [x26], #0x2\n"
700 "str h3, [x24], #0x2\n"
701 "tbz %x[width], #0, 31f\n"
702 "st1 { v7.b }[2], [x26], #0x1\n"
703 "st1 { v3.b }[2], [x24], #0x1\n"
704 "b 31f\n"
705 "30:" // tail loop: unique 3: partial_0_0
706 "str b7, [x26], #0x1\n"
707 "str b3, [x24], #0x1\n"
708 "31:" // tail loop: unique 3: Done
709 "subs x23, x23, #0x2\n"
710 "bgt 6b\n"
711 "32:" // odd columns skip
712 : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
713 : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
714 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
715}
716
717} // namespace
718
719namespace arm_compute
720{
721namespace cpu
722{
723void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
724 ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
725{
726 ARM_COMPUTE_UNUSED(policy);
727
728 const ITensorInfo *final_output_info = final_output->info();
729 const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr;
730 const ITensorInfo *input1_info = input1->info();
731 const ITensorInfo *input2_info = input2->info();
732
733 const size_t out_stride = final_output_info->strides_in_bytes()[1];
734 const size_t out_direct_stride = (add_output != nullptr) ? add_output_info->strides_in_bytes()[1] : 0;
735 const size_t in0_stride = input1_info->strides_in_bytes()[1];
736 const size_t in1_stride = input2_info->strides_in_bytes()[1];
737
738 uint8_t minval = std::numeric_limits<uint8_t>::lowest();
739 uint8_t maxval = std::numeric_limits<uint8_t>::max();
740
741 const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
742 if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
743 {
744 minval = quantize_qasymm8(0.f, final_output_qinfo);
745 }
746 else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
747 {
748 minval = quantize_qasymm8(0.f, final_output_qinfo);
749 maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
750 }
751 else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
752 {
753 minval = quantize_qasymm8(act_info.b(), final_output_qinfo);
754 maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
755 }
756
757 const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
758 const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
759 const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
760
761 const int32_t in1_offset = in1_qinfo.offset;
762 const int32_t in2_offset = in2_qinfo.offset;
763 const int32_t out_offset = final_output_qinfo.offset;
764 const int32_t out_direct_offset = add_output_qinfo.offset;
765
766 const float in1_scale = in1_qinfo.scale;
767 const float in2_scale = in2_qinfo.scale;
768 const float out_scale = final_output_qinfo.scale;
769 const float out_direct_scale = add_output_qinfo.scale;
770
771 const float *bn_mul_buffer = reinterpret_cast<float *>(bn_mul->buffer());
772 const float *bn_add_buffer = reinterpret_cast<float *>(bn_add->buffer());
773
774 // Clear X & Y dimensions on execution window as we handle manually
775 Window win = window;
776 win.set(Window::DimX, Window::Dimension(0, 1, 1));
777 win.set(Window::DimY, Window::Dimension(0, 1, 1));
778
779 Iterator in1_it(input1, window);
780 Iterator in2_it(input2, window);
781 Iterator out_it(final_output, window);
782
783 const size_t width = window.num_iterations(0);
784 const size_t height = window.num_iterations(1);
785
786 if(add_output != nullptr)
787 {
788 Iterator add_out_it(add_output, window);
789 execute_window_loop(
790 win, [&](const Coordinates &)
791 {
792 a64_add_bn_clamp_direct_u8_fp32_2x16(
793 reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
794 reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
795 reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
796 reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
797 bn_mul_buffer,
798 bn_add_buffer,
799 minval,
800 maxval,
801 out_offset, out_scale,
802 out_direct_offset, out_direct_scale,
803 in1_offset, in1_scale,
804 in2_offset, in2_scale,
805 width, height);
806 },
807 in1_it, in2_it, add_out_it, out_it);
808 }
809 else
810 {
811 execute_window_loop(
812 win, [&](const Coordinates &)
813 {
814 a64_add_bn_clamp_direct_u8_fp32_2x16(
815 reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
816 nullptr, out_direct_stride,
817 reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
818 reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
819 bn_mul_buffer,
820 bn_add_buffer,
821 minval,
822 maxval,
823 out_offset, out_scale,
824 out_direct_offset, out_direct_scale,
825 in1_offset, in1_scale,
826 in2_offset, in2_scale,
827 width, height);
828 },
829 in1_it, in2_it, out_it);
830 }
831}
832} // namespace cpu
833} // namespace arm_compute
834
835#endif // __aarch64__