blob: 3bde83cc267cc3a3a379444942bc788609483756 [file] [log] [blame]
Gunes Bayirae72a462023-01-29 13:24:24 +00001/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "arm_compute/core/Helpers.h"
26#include "arm_compute/core/ITensor.h"
27#include "arm_compute/core/QuantizationInfo.h"
28#include "arm_compute/core/Types.h"
29#include "arm_compute/core/Window.h"
30
31#include <cstddef>
32#include <cstdint>
33#include <limits>
34
35#ifdef __aarch64__
36namespace
37{
38void a64_add_bn_clamp_direct_s8_fp32_2x16(
39 int8_t *out, size_t out_stride,
40 int8_t *out_direct, size_t out_direct_stride,
41 const int8_t *in0, size_t in0_stride,
42 const int8_t *in1, size_t in1_stride,
43 const float *bn_mul,
44 const float *bn_add,
45 const int8_t minval,
46 const int8_t maxval,
47 int32_t out_zeropt, float out_scale,
48 int32_t out_direct_zeropt, float out_direct_scale,
49 int32_t in0_zeropt, float in0_scale,
50 int32_t in1_zeropt, float in1_scale,
51 size_t width, size_t height)
52{
53 float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
54 struct KernelArgs
55 {
56 const float *scales;
57 int32_t in0_zeropt;
58 int32_t in1_zeropt;
59 int32_t out_zeropt;
60 int32_t out_direct_zeropt;
61 int32_t minval;
62 int32_t maxval;
63 } ka;
64 ka.scales = scales;
65 ka.in0_zeropt = in0_zeropt;
66 ka.in1_zeropt = in1_zeropt;
67 ka.out_zeropt = out_zeropt;
68 ka.out_direct_zeropt = out_direct_zeropt;
69 ka.minval = minval;
70 ka.maxval = maxval;
71
72 __asm__ __volatile__(
73 "ldr x20, [%x[args_ptr], %[offsetof_scales]]\n"
74 "ld1 { v0.4s }, [x20]\n"
75 "cmp %x[width], #0x10\n"
76 "blt 5f\n"
77 "1:" // Column loop
78 "ldr q24, [%x[bn_mul], #0x0]\n"
79 "ldr q25, [%x[bn_mul], #0x10]\n"
80 "mov x23, %x[height]\n"
81 "mov x12, %x[in0]\n"
82 "ldr q26, [%x[bn_mul], #0x20]\n"
83 "ldr q27, [%x[bn_mul], #0x30]\n"
84 "mov x11, %x[in1]\n"
85 "mov x10, %x[out]\n"
86 "ldr q28, [%x[bn_add], #0x0]\n"
87 "ldr q29, [%x[bn_add], #0x10]\n"
88 "mov x9, %x[out_direct]\n"
89 "add %x[bn_mul], %x[bn_mul], #0x40\n"
90 "ldr q30, [%x[bn_add], #0x20]\n"
91 "ldr q31, [%x[bn_add], #0x30]\n"
92 "add %x[bn_add], %x[bn_add], #0x40\n"
93 "2:" // Row loop
94 "mov x28, x12\n"
95 "ldr d4, [x28, #0x0]\n"
96 "ldr d3, [x28, #0x8]\n"
97 "add x21, x28, %x[in0_stride]\n"
98 "mov x27, x11\n"
99 "ldr d13, [x27, #0x0]\n"
100 "ldr d12, [x27, #0x8]\n"
101 "cmp x23, #0x2\n"
102 "add x12, x21, %x[in0_stride]\n"
103 "csel x21, x21, x28, GE\n"
104 "ldr d2, [x21, #0x0]\n"
105 "ldr d11, [x21, #0x8]\n"
106 "add x20, x27, %x[in1_stride]\n"
107 "add x11, x20, %x[in1_stride]\n"
108 "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
109 "sshll v4.8h, v4.8b, #0x0\n"
110 "csel x20, x20, x27, GE\n"
111 "ldr d10, [x20, #0x0]\n"
112 "ldr d9, [x20, #0x8]\n"
113 "sshll v3.8h, v3.8b, #0x0\n"
114 "sshll v2.8h, v2.8b, #0x0\n"
115 "sshll v11.8h, v11.8b, #0x0\n"
116 "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
117 "mov x26, x10\n"
118 "dup v16.8h, w21\n"
119 "sshll v13.8h, v13.8b, #0x0\n"
120 "mov x25, x9\n"
121 "add x24, x26, %x[out_stride]\n"
122 "sshll v12.8h, v12.8b, #0x0\n"
123 "sshll v10.8h, v10.8b, #0x0\n"
124 "add x22, x25, %x[out_direct_stride]\n"
125 "add x10, x24, %x[out_stride]\n"
126 "sshll v9.8h, v9.8b, #0x0\n"
127 "ssubl v1.4s, v4.4h, v16.4h\n"
128 "add x9, x22, %x[out_direct_stride]\n"
129 "csel x24, x24, x26, GE\n"
130 "ssubl2 v4.4s, v4.8h, v16.8h\n"
131 "ssubl v23.4s, v3.4h, v16.4h\n"
132 "csel x22, x22, x25, GE\n"
133 "ssubl2 v3.4s, v3.8h, v16.8h\n"
134 "ssubl v22.4s, v2.4h, v16.4h\n"
135 "ssubl2 v2.4s, v2.8h, v16.8h\n"
136 "ssubl v21.4s, v11.4h, v16.4h\n"
137 "ssubl2 v11.4s, v11.8h, v16.8h\n"
138 "dup v20.8h, w20\n"
139 "ssubl v19.4s, v13.4h, v20.4h\n"
140 "ssubl2 v13.4s, v13.8h, v20.8h\n"
141 "ssubl v18.4s, v12.4h, v20.4h\n"
142 "ssubl2 v12.4s, v12.8h, v20.8h\n"
143 "ssubl v17.4s, v10.4h, v20.4h\n"
144 "ssubl2 v10.4s, v10.8h, v20.8h\n"
145 "ssubl v16.4s, v9.4h, v20.4h\n"
146 "ssubl2 v9.4s, v9.8h, v20.8h\n"
147 "scvtf v8.4s, v1.4s\n"
148 "scvtf v7.4s, v4.4s\n"
149 "scvtf v6.4s, v23.4s\n"
150 "scvtf v5.4s, v3.4s\n"
151 "scvtf v4.4s, v22.4s\n"
152 "scvtf v3.4s, v2.4s\n"
153 "scvtf v2.4s, v21.4s\n"
154 "scvtf v1.4s, v11.4s\n"
155 "scvtf v19.4s, v19.4s\n"
156 "fmul v8.4s, v8.4s, v0.s[0]\n"
157 "fmla v8.4s, v19.4s, v0.s[1]\n"
158 "scvtf v13.4s, v13.4s\n"
159 "fmul v7.4s, v7.4s, v0.s[0]\n"
160 "fmla v7.4s, v13.4s, v0.s[1]\n"
161 "scvtf v18.4s, v18.4s\n"
162 "fmul v6.4s, v6.4s, v0.s[0]\n"
163 "fmla v6.4s, v18.4s, v0.s[1]\n"
164 "scvtf v12.4s, v12.4s\n"
165 "fmul v5.4s, v5.4s, v0.s[0]\n"
166 "fmla v5.4s, v12.4s, v0.s[1]\n"
167 "scvtf v17.4s, v17.4s\n"
168 "fmul v4.4s, v4.4s, v0.s[0]\n"
169 "fmla v4.4s, v17.4s, v0.s[1]\n"
170 "scvtf v10.4s, v10.4s\n"
171 "fmul v3.4s, v3.4s, v0.s[0]\n"
172 "fmla v3.4s, v10.4s, v0.s[1]\n"
173 "scvtf v16.4s, v16.4s\n"
174 "fmul v2.4s, v2.4s, v0.s[0]\n"
175 "fmla v2.4s, v16.4s, v0.s[1]\n"
176 "scvtf v9.4s, v9.4s\n"
177 "fmul v1.4s, v1.4s, v0.s[0]\n"
178 "fmla v1.4s, v9.4s, v0.s[1]\n"
179 "cbz %x[out_direct], 3f\n"
180 "fmul v23.4s, v8.4s, v0.s[3]\n"
181 "fmul v22.4s, v7.4s, v0.s[3]\n"
182 "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
183 "fmul v21.4s, v6.4s, v0.s[3]\n"
184 "fmul v20.4s, v5.4s, v0.s[3]\n"
185 "fmul v17.4s, v4.4s, v0.s[3]\n"
186 "fmul v19.4s, v3.4s, v0.s[3]\n"
187 "fmul v16.4s, v2.4s, v0.s[3]\n"
188 "fmul v18.4s, v1.4s, v0.s[3]\n"
189 "fcvtas v23.4s, v23.4s\n"
190 "fcvtas v22.4s, v22.4s\n"
191 "fcvtas v21.4s, v21.4s\n"
192 "fcvtas v20.4s, v20.4s\n"
193 "fcvtas v17.4s, v17.4s\n"
194 "fcvtas v19.4s, v19.4s\n"
195 "fcvtas v16.4s, v16.4s\n"
196 "fcvtas v18.4s, v18.4s\n"
197 "uzp1 v22.8h, v23.8h, v22.8h\n"
198 "uzp1 v20.8h, v21.8h, v20.8h\n"
199 "uzp1 v19.8h, v17.8h, v19.8h\n"
200 "uzp1 v18.8h, v16.8h, v18.8h\n"
201 "dup v16.8h, w20\n"
202 "add v22.8h, v22.8h, v16.8h\n"
203 "add v20.8h, v20.8h, v16.8h\n"
204 "add v19.8h, v19.8h, v16.8h\n"
205 "add v18.8h, v18.8h, v16.8h\n"
206 "movi v17.8h, #0x7f\n"
207 "mvni v16.8h, #0x7f\n"
208 "smin v22.8h, v22.8h, v17.8h\n"
209 "smin v20.8h, v20.8h, v17.8h\n"
210 "smin v19.8h, v19.8h, v17.8h\n"
211 "smin v18.8h, v18.8h, v17.8h\n"
212 "smax v22.8h, v22.8h, v16.8h\n"
213 "smax v20.8h, v20.8h, v16.8h\n"
214 "smax v19.8h, v19.8h, v16.8h\n"
215 "smax v18.8h, v18.8h, v16.8h\n"
216 "xtn v22.8b, v22.8h\n"
217 "str d22, [x25, #0x0]\n"
218 "xtn v20.8b, v20.8h\n"
219 "xtn v19.8b, v19.8h\n"
220 "str d20, [x25, #0x8]\n"
221 "xtn v18.8b, v18.8h\n"
222 "str d19, [x22, #0x0]\n"
223 "str d18, [x22, #0x8]\n"
224 "3:" // Main loop: No direct output
225 "mov v19.16b, v28.16b\n"
226 "mov v13.16b, v29.16b\n"
227 "fmla v19.4s, v8.4s, v24.4s\n"
228 "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
229 "mov v18.16b, v30.16b\n"
230 "mov v12.16b, v31.16b\n"
231 "fmla v13.4s, v7.4s, v25.4s\n"
232 "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
233 "mov v17.16b, v28.16b\n"
234 "mov v10.16b, v29.16b\n"
235 "fmla v18.4s, v6.4s, v26.4s\n"
236 "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
237 "mov v16.16b, v30.16b\n"
238 "mov v9.16b, v31.16b\n"
239 "fmla v12.4s, v5.4s, v27.4s\n"
240 "subs x23, x23, #0x2\n"
241 "fmla v17.4s, v4.4s, v24.4s\n"
242 "fmla v10.4s, v3.4s, v25.4s\n"
243 "fmul v8.4s, v19.4s, v0.s[2]\n"
244 "fmla v16.4s, v2.4s, v26.4s\n"
245 "fmla v9.4s, v1.4s, v27.4s\n"
246 "fmul v7.4s, v13.4s, v0.s[2]\n"
247 "fmul v6.4s, v18.4s, v0.s[2]\n"
248 "fmul v5.4s, v12.4s, v0.s[2]\n"
249 "fmul v4.4s, v17.4s, v0.s[2]\n"
250 "fmul v3.4s, v10.4s, v0.s[2]\n"
251 "fmul v2.4s, v16.4s, v0.s[2]\n"
252 "fmul v1.4s, v9.4s, v0.s[2]\n"
253 "fcvtas v8.4s, v8.4s\n"
254 "fcvtas v7.4s, v7.4s\n"
255 "fcvtas v6.4s, v6.4s\n"
256 "fcvtas v5.4s, v5.4s\n"
257 "fcvtas v4.4s, v4.4s\n"
258 "fcvtas v3.4s, v3.4s\n"
259 "fcvtas v2.4s, v2.4s\n"
260 "fcvtas v1.4s, v1.4s\n"
261 "uzp1 v7.8h, v8.8h, v7.8h\n"
262 "uzp1 v5.8h, v6.8h, v5.8h\n"
263 "uzp1 v3.8h, v4.8h, v3.8h\n"
264 "uzp1 v1.8h, v2.8h, v1.8h\n"
265 "dup v16.8h, w22\n"
266 "add v7.8h, v7.8h, v16.8h\n"
267 "add v5.8h, v5.8h, v16.8h\n"
268 "add v3.8h, v3.8h, v16.8h\n"
269 "add v1.8h, v1.8h, v16.8h\n"
270 "dup v16.8h, w21\n"
271 "smin v7.8h, v7.8h, v16.8h\n"
272 "smin v5.8h, v5.8h, v16.8h\n"
273 "smin v3.8h, v3.8h, v16.8h\n"
274 "smin v1.8h, v1.8h, v16.8h\n"
275 "dup v16.8h, w20\n"
276 "smax v7.8h, v7.8h, v16.8h\n"
277 "smax v5.8h, v5.8h, v16.8h\n"
278 "smax v3.8h, v3.8h, v16.8h\n"
279 "smax v1.8h, v1.8h, v16.8h\n"
280 "xtn v7.8b, v7.8h\n"
281 "str d7, [x26, #0x0]\n"
282 "xtn v5.8b, v5.8h\n"
283 "xtn v3.8b, v3.8h\n"
284 "str d5, [x26, #0x8]\n"
285 "xtn v1.8b, v1.8h\n"
286 "str d3, [x24, #0x0]\n"
287 "str d1, [x24, #0x8]\n"
288 "bgt 2b\n"
289 "add %x[in0], %x[in0], #0x10\n"
290 "add %x[in1], %x[in1], #0x10\n"
291 "add %x[out], %x[out], #0x10\n"
292 "cbz %x[out_direct], 4f\n"
293 "add %x[out_direct], %x[out_direct], #0x10\n"
294 "4:" // No direct pointer update
295 "sub %x[width], %x[width], #0x10\n"
296 "cmp %x[width], #0x10\n"
297 "bge 1b\n"
298 "cbz %x[width], 32f\n"
299 "5:" // main loop skip
300 "ldr q24, [%x[bn_mul], #0x0]\n"
301 "ldr q25, [%x[bn_mul], #0x10]\n"
302 "mov x23, %x[height]\n"
303 "mov x12, %x[in0]\n"
304 "ldr q26, [%x[bn_mul], #0x20]\n"
305 "ldr q27, [%x[bn_mul], #0x30]\n"
306 "mov x11, %x[in1]\n"
307 "mov x10, %x[out]\n"
308 "ldr q28, [%x[bn_add], #0x0]\n"
309 "ldr q29, [%x[bn_add], #0x10]\n"
310 "mov x9, %x[out_direct]\n"
311 "add %x[bn_mul], %x[bn_mul], #0x40\n"
312 "ldr q30, [%x[bn_add], #0x20]\n"
313 "ldr q31, [%x[bn_add], #0x30]\n"
314 "add %x[bn_add], %x[bn_add], #0x40\n"
315 "6:" // tail loop: Row loop
316 "mov x28, x12\n"
317 "mov x27, x11\n"
318 "mov x26, x10\n"
319 "mov x25, x9\n"
320 "add x21, x28, %x[in0_stride]\n"
321 "add x20, x27, %x[in1_stride]\n"
322 "add x24, x26, %x[out_stride]\n"
323 "add x22, x25, %x[out_direct_stride]\n"
324 "cmp x23, #0x2\n"
325 "add x12, x21, %x[in0_stride]\n"
326 "add x11, x20, %x[in1_stride]\n"
327 "add x10, x24, %x[out_stride]\n"
328 "add x9, x22, %x[out_direct_stride]\n"
329 "csel x21, x21, x28, GE\n"
330 "csel x20, x20, x27, GE\n"
331 "csel x24, x24, x26, GE\n"
332 "csel x22, x22, x25, GE\n"
333 "tbz %x[width], #3, 10f\n"
334 "ldr d4, [x28, #0x0]\n"
335 "ldr d13, [x27, #0x0]\n"
336 "add x28, x28, #0x8\n"
337 "add x27, x27, #0x8\n"
338 "ldr d2, [x21, #0x0]\n"
339 "ldr d10, [x20, #0x0]\n"
340 "add x21, x21, #0x8\n"
341 "add x20, x20, #0x8\n"
342 "tbz %x[width], #2, 8f\n"
343 "ldr s3, [x28], #0x4\n"
344 "ldr s12, [x27], #0x4\n"
345 "ldr s11, [x21], #0x4\n"
346 "ldr s9, [x20], #0x4\n"
347 "tbz %x[width], #1, 7f\n"
348 "ld1 { v3.h }[2], [x28], #0x2\n"
349 "ld1 { v12.h }[2], [x27], #0x2\n"
350 "ld1 { v11.h }[2], [x21], #0x2\n"
351 "ld1 { v9.h }[2], [x20], #0x2\n"
352 "tbz %x[width], #0, 14f\n"
353 "ld1 { v3.b }[6], [x28], #0x1\n"
354 "ld1 { v12.b }[6], [x27], #0x1\n"
355 "ld1 { v11.b }[6], [x21], #0x1\n"
356 "ld1 { v9.b }[6], [x20], #0x1\n"
357 "b 14f\n"
358 "7:" // tail loop: unique 1: partial_0_12
359 "tbz %x[width], #0, 14f\n"
360 "ld1 { v3.b }[4], [x28], #0x1\n"
361 "ld1 { v12.b }[4], [x27], #0x1\n"
362 "ld1 { v11.b }[4], [x21], #0x1\n"
363 "ld1 { v9.b }[4], [x20], #0x1\n"
364 "b 14f\n"
365 "8:" // tail loop: unique 1: partial_1_8
366 "tbz %x[width], #1, 9f\n"
367 "ldr h3, [x28], #0x2\n"
368 "ldr h12, [x27], #0x2\n"
369 "ldr h11, [x21], #0x2\n"
370 "ldr h9, [x20], #0x2\n"
371 "tbz %x[width], #0, 14f\n"
372 "ld1 { v3.b }[2], [x28], #0x1\n"
373 "ld1 { v12.b }[2], [x27], #0x1\n"
374 "ld1 { v11.b }[2], [x21], #0x1\n"
375 "ld1 { v9.b }[2], [x20], #0x1\n"
376 "b 14f\n"
377 "9:" // tail loop: unique 1: partial_0_8
378 "tbz %x[width], #0, 14f\n"
379 "ldr b3, [x28], #0x1\n"
380 "ldr b12, [x27], #0x1\n"
381 "ldr b11, [x21], #0x1\n"
382 "ldr b9, [x20], #0x1\n"
383 "b 14f\n"
384 "10:" // tail loop: unique 1: partial_2_0
385 "tbz %x[width], #2, 12f\n"
386 "ldr s4, [x28], #0x4\n"
387 "ldr s13, [x27], #0x4\n"
388 "ldr s2, [x21], #0x4\n"
389 "ldr s10, [x20], #0x4\n"
390 "tbz %x[width], #1, 11f\n"
391 "ld1 { v4.h }[2], [x28], #0x2\n"
392 "ld1 { v13.h }[2], [x27], #0x2\n"
393 "ld1 { v2.h }[2], [x21], #0x2\n"
394 "ld1 { v10.h }[2], [x20], #0x2\n"
395 "tbz %x[width], #0, 14f\n"
396 "ld1 { v4.b }[6], [x28], #0x1\n"
397 "ld1 { v13.b }[6], [x27], #0x1\n"
398 "ld1 { v2.b }[6], [x21], #0x1\n"
399 "ld1 { v10.b }[6], [x20], #0x1\n"
400 "b 14f\n"
401 "11:" // tail loop: unique 1: partial_0_4
402 "tbz %x[width], #0, 14f\n"
403 "ld1 { v4.b }[4], [x28], #0x1\n"
404 "ld1 { v13.b }[4], [x27], #0x1\n"
405 "ld1 { v2.b }[4], [x21], #0x1\n"
406 "ld1 { v10.b }[4], [x20], #0x1\n"
407 "b 14f\n"
408 "12:" // tail loop: unique 1: partial_1_0
409 "tbz %x[width], #1, 13f\n"
410 "ldr h4, [x28], #0x2\n"
411 "ldr h13, [x27], #0x2\n"
412 "ldr h2, [x21], #0x2\n"
413 "ldr h10, [x20], #0x2\n"
414 "tbz %x[width], #0, 14f\n"
415 "ld1 { v4.b }[2], [x28], #0x1\n"
416 "ld1 { v13.b }[2], [x27], #0x1\n"
417 "ld1 { v2.b }[2], [x21], #0x1\n"
418 "ld1 { v10.b }[2], [x20], #0x1\n"
419 "b 14f\n"
420 "13:" // tail loop: unique 1: partial_0_0
421 "ldr b4, [x28], #0x1\n"
422 "ldr b13, [x27], #0x1\n"
423 "ldr b2, [x21], #0x1\n"
424 "ldr b10, [x20], #0x1\n"
425 "14:" // tail loop: unique 1: Done
426 "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
427 "sshll v4.8h, v4.8b, #0x0\n"
428 "sshll v3.8h, v3.8b, #0x0\n"
429 "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
430 "sshll v2.8h, v2.8b, #0x0\n"
431 "sshll v11.8h, v11.8b, #0x0\n"
432 "dup v16.8h, w21\n"
433 "sshll v13.8h, v13.8b, #0x0\n"
434 "sshll v12.8h, v12.8b, #0x0\n"
435 "sshll v10.8h, v10.8b, #0x0\n"
436 "sshll v9.8h, v9.8b, #0x0\n"
437 "ssubl v1.4s, v4.4h, v16.4h\n"
438 "ssubl2 v4.4s, v4.8h, v16.8h\n"
439 "ssubl v23.4s, v3.4h, v16.4h\n"
440 "ssubl2 v3.4s, v3.8h, v16.8h\n"
441 "ssubl v22.4s, v2.4h, v16.4h\n"
442 "ssubl2 v2.4s, v2.8h, v16.8h\n"
443 "ssubl v21.4s, v11.4h, v16.4h\n"
444 "ssubl2 v11.4s, v11.8h, v16.8h\n"
445 "dup v20.8h, w20\n"
446 "ssubl v19.4s, v13.4h, v20.4h\n"
447 "ssubl2 v13.4s, v13.8h, v20.8h\n"
448 "ssubl v18.4s, v12.4h, v20.4h\n"
449 "ssubl2 v12.4s, v12.8h, v20.8h\n"
450 "ssubl v17.4s, v10.4h, v20.4h\n"
451 "ssubl2 v10.4s, v10.8h, v20.8h\n"
452 "ssubl v16.4s, v9.4h, v20.4h\n"
453 "ssubl2 v9.4s, v9.8h, v20.8h\n"
454 "scvtf v8.4s, v1.4s\n"
455 "scvtf v7.4s, v4.4s\n"
456 "scvtf v6.4s, v23.4s\n"
457 "scvtf v5.4s, v3.4s\n"
458 "scvtf v4.4s, v22.4s\n"
459 "scvtf v3.4s, v2.4s\n"
460 "scvtf v2.4s, v21.4s\n"
461 "scvtf v1.4s, v11.4s\n"
462 "scvtf v19.4s, v19.4s\n"
463 "fmul v8.4s, v8.4s, v0.s[0]\n"
464 "fmla v8.4s, v19.4s, v0.s[1]\n"
465 "scvtf v13.4s, v13.4s\n"
466 "fmul v7.4s, v7.4s, v0.s[0]\n"
467 "fmla v7.4s, v13.4s, v0.s[1]\n"
468 "scvtf v18.4s, v18.4s\n"
469 "fmul v6.4s, v6.4s, v0.s[0]\n"
470 "fmla v6.4s, v18.4s, v0.s[1]\n"
471 "scvtf v12.4s, v12.4s\n"
472 "fmul v5.4s, v5.4s, v0.s[0]\n"
473 "fmla v5.4s, v12.4s, v0.s[1]\n"
474 "scvtf v17.4s, v17.4s\n"
475 "fmul v4.4s, v4.4s, v0.s[0]\n"
476 "fmla v4.4s, v17.4s, v0.s[1]\n"
477 "scvtf v10.4s, v10.4s\n"
478 "fmul v3.4s, v3.4s, v0.s[0]\n"
479 "fmla v3.4s, v10.4s, v0.s[1]\n"
480 "scvtf v16.4s, v16.4s\n"
481 "fmul v2.4s, v2.4s, v0.s[0]\n"
482 "fmla v2.4s, v16.4s, v0.s[1]\n"
483 "scvtf v9.4s, v9.4s\n"
484 "fmul v1.4s, v1.4s, v0.s[0]\n"
485 "fmla v1.4s, v9.4s, v0.s[1]\n"
486 "cbz %x[out_direct], 23f\n"
487 "fmul v23.4s, v8.4s, v0.s[3]\n"
488 "fmul v22.4s, v7.4s, v0.s[3]\n"
489 "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
490 "fmul v21.4s, v6.4s, v0.s[3]\n"
491 "fmul v20.4s, v5.4s, v0.s[3]\n"
492 "fmul v17.4s, v4.4s, v0.s[3]\n"
493 "fmul v19.4s, v3.4s, v0.s[3]\n"
494 "fmul v16.4s, v2.4s, v0.s[3]\n"
495 "fmul v18.4s, v1.4s, v0.s[3]\n"
496 "fcvtas v23.4s, v23.4s\n"
497 "fcvtas v22.4s, v22.4s\n"
498 "fcvtas v21.4s, v21.4s\n"
499 "fcvtas v20.4s, v20.4s\n"
500 "fcvtas v17.4s, v17.4s\n"
501 "fcvtas v19.4s, v19.4s\n"
502 "fcvtas v16.4s, v16.4s\n"
503 "fcvtas v18.4s, v18.4s\n"
504 "uzp1 v22.8h, v23.8h, v22.8h\n"
505 "uzp1 v20.8h, v21.8h, v20.8h\n"
506 "uzp1 v19.8h, v17.8h, v19.8h\n"
507 "uzp1 v18.8h, v16.8h, v18.8h\n"
508 "dup v16.8h, w20\n"
509 "add v22.8h, v22.8h, v16.8h\n"
510 "add v20.8h, v20.8h, v16.8h\n"
511 "add v19.8h, v19.8h, v16.8h\n"
512 "add v18.8h, v18.8h, v16.8h\n"
513 "movi v17.8h, #0x7f\n"
514 "mvni v16.8h, #0x7f\n"
515 "smin v22.8h, v22.8h, v17.8h\n"
516 "smin v20.8h, v20.8h, v17.8h\n"
517 "smin v19.8h, v19.8h, v17.8h\n"
518 "smin v18.8h, v18.8h, v17.8h\n"
519 "smax v22.8h, v22.8h, v16.8h\n"
520 "smax v20.8h, v20.8h, v16.8h\n"
521 "smax v19.8h, v19.8h, v16.8h\n"
522 "smax v18.8h, v18.8h, v16.8h\n"
523 "xtn v22.8b, v22.8h\n"
524 "xtn v20.8b, v20.8h\n"
525 "xtn v19.8b, v19.8h\n"
526 "xtn v18.8b, v18.8h\n"
527 "tbz %x[width], #3, 18f\n"
528 "str d22, [x25, #0x0]\n"
529 "add x25, x25, #0x8\n"
530 "str d19, [x22, #0x0]\n"
531 "add x22, x22, #0x8\n"
532 "tbz %x[width], #2, 16f\n"
533 "str s20, [x25], #0x4\n"
534 "str s18, [x22], #0x4\n"
535 "tbz %x[width], #1, 15f\n"
536 "st1 { v20.h }[2], [x25], #0x2\n"
537 "st1 { v18.h }[2], [x22], #0x2\n"
538 "tbz %x[width], #0, 22f\n"
539 "st1 { v20.b }[6], [x25], #0x1\n"
540 "st1 { v18.b }[6], [x22], #0x1\n"
541 "b 22f\n"
542 "15:" // tail loop: Main loop: unique 2: partial_0_12
543 "tbz %x[width], #0, 22f\n"
544 "st1 { v20.b }[4], [x25], #0x1\n"
545 "st1 { v18.b }[4], [x22], #0x1\n"
546 "b 22f\n"
547 "16:" // tail loop: Main loop: unique 2: partial_1_8
548 "tbz %x[width], #1, 17f\n"
549 "str h20, [x25], #0x2\n"
550 "str h18, [x22], #0x2\n"
551 "tbz %x[width], #0, 22f\n"
552 "st1 { v20.b }[2], [x25], #0x1\n"
553 "st1 { v18.b }[2], [x22], #0x1\n"
554 "b 22f\n"
555 "17:" // tail loop: Main loop: unique 2: partial_0_8
556 "tbz %x[width], #0, 22f\n"
557 "str b20, [x25], #0x1\n"
558 "str b18, [x22], #0x1\n"
559 "b 22f\n"
560 "18:" // tail loop: Main loop: unique 2: partial_2_0
561 "tbz %x[width], #2, 20f\n"
562 "str s22, [x25], #0x4\n"
563 "str s19, [x22], #0x4\n"
564 "tbz %x[width], #1, 19f\n"
565 "st1 { v22.h }[2], [x25], #0x2\n"
566 "st1 { v19.h }[2], [x22], #0x2\n"
567 "tbz %x[width], #0, 22f\n"
568 "st1 { v22.b }[6], [x25], #0x1\n"
569 "st1 { v19.b }[6], [x22], #0x1\n"
570 "b 22f\n"
571 "19:" // tail loop: Main loop: unique 2: partial_0_4
572 "tbz %x[width], #0, 22f\n"
573 "st1 { v22.b }[4], [x25], #0x1\n"
574 "st1 { v19.b }[4], [x22], #0x1\n"
575 "b 22f\n"
576 "20:" // tail loop: Main loop: unique 2: partial_1_0
577 "tbz %x[width], #1, 21f\n"
578 "str h22, [x25], #0x2\n"
579 "str h19, [x22], #0x2\n"
580 "tbz %x[width], #0, 22f\n"
581 "st1 { v22.b }[2], [x25], #0x1\n"
582 "st1 { v19.b }[2], [x22], #0x1\n"
583 "b 22f\n"
584 "21:" // tail loop: Main loop: unique 2: partial_0_0
585 "str b22, [x25], #0x1\n"
586 "str b19, [x22], #0x1\n"
587 "22:" // tail loop: Main loop: unique 2: Done
588 "23:" // tail loop: Main loop: No direct output
589 "mov v19.16b, v28.16b\n"
590 "mov v13.16b, v29.16b\n"
591 "fmla v19.4s, v8.4s, v24.4s\n"
592 "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
593 "mov v18.16b, v30.16b\n"
594 "mov v12.16b, v31.16b\n"
595 "fmla v13.4s, v7.4s, v25.4s\n"
596 "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
597 "mov v17.16b, v28.16b\n"
598 "mov v10.16b, v29.16b\n"
599 "fmla v18.4s, v6.4s, v26.4s\n"
600 "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
601 "mov v16.16b, v30.16b\n"
602 "mov v9.16b, v31.16b\n"
603 "fmla v12.4s, v5.4s, v27.4s\n"
604 "fmla v17.4s, v4.4s, v24.4s\n"
605 "fmla v10.4s, v3.4s, v25.4s\n"
606 "fmul v8.4s, v19.4s, v0.s[2]\n"
607 "fmla v16.4s, v2.4s, v26.4s\n"
608 "fmla v9.4s, v1.4s, v27.4s\n"
609 "fmul v7.4s, v13.4s, v0.s[2]\n"
610 "fmul v6.4s, v18.4s, v0.s[2]\n"
611 "fmul v5.4s, v12.4s, v0.s[2]\n"
612 "fmul v4.4s, v17.4s, v0.s[2]\n"
613 "fmul v3.4s, v10.4s, v0.s[2]\n"
614 "fmul v2.4s, v16.4s, v0.s[2]\n"
615 "fmul v1.4s, v9.4s, v0.s[2]\n"
616 "fcvtas v8.4s, v8.4s\n"
617 "fcvtas v7.4s, v7.4s\n"
618 "fcvtas v6.4s, v6.4s\n"
619 "fcvtas v5.4s, v5.4s\n"
620 "fcvtas v4.4s, v4.4s\n"
621 "fcvtas v3.4s, v3.4s\n"
622 "fcvtas v2.4s, v2.4s\n"
623 "fcvtas v1.4s, v1.4s\n"
624 "uzp1 v7.8h, v8.8h, v7.8h\n"
625 "uzp1 v5.8h, v6.8h, v5.8h\n"
626 "uzp1 v3.8h, v4.8h, v3.8h\n"
627 "uzp1 v1.8h, v2.8h, v1.8h\n"
628 "dup v16.8h, w22\n"
629 "add v7.8h, v7.8h, v16.8h\n"
630 "add v5.8h, v5.8h, v16.8h\n"
631 "add v3.8h, v3.8h, v16.8h\n"
632 "add v1.8h, v1.8h, v16.8h\n"
633 "dup v16.8h, w21\n"
634 "smin v7.8h, v7.8h, v16.8h\n"
635 "smin v5.8h, v5.8h, v16.8h\n"
636 "smin v3.8h, v3.8h, v16.8h\n"
637 "smin v1.8h, v1.8h, v16.8h\n"
638 "dup v16.8h, w20\n"
639 "smax v7.8h, v7.8h, v16.8h\n"
640 "smax v5.8h, v5.8h, v16.8h\n"
641 "smax v3.8h, v3.8h, v16.8h\n"
642 "smax v1.8h, v1.8h, v16.8h\n"
643 "xtn v7.8b, v7.8h\n"
644 "xtn v5.8b, v5.8h\n"
645 "xtn v3.8b, v3.8h\n"
646 "xtn v1.8b, v1.8h\n"
647 "tbz %x[width], #3, 27f\n"
648 "str d7, [x26, #0x0]\n"
649 "add x26, x26, #0x8\n"
650 "str d3, [x24, #0x0]\n"
651 "add x24, x24, #0x8\n"
652 "tbz %x[width], #2, 25f\n"
653 "str s5, [x26], #0x4\n"
654 "str s1, [x24], #0x4\n"
655 "tbz %x[width], #1, 24f\n"
656 "st1 { v5.h }[2], [x26], #0x2\n"
657 "st1 { v1.h }[2], [x24], #0x2\n"
658 "tbz %x[width], #0, 31f\n"
659 "st1 { v5.b }[6], [x26], #0x1\n"
660 "st1 { v1.b }[6], [x24], #0x1\n"
661 "b 31f\n"
662 "24:" // tail loop: unique 3: partial_0_12
663 "tbz %x[width], #0, 31f\n"
664 "st1 { v5.b }[4], [x26], #0x1\n"
665 "st1 { v1.b }[4], [x24], #0x1\n"
666 "b 31f\n"
667 "25:" // tail loop: unique 3: partial_1_8
668 "tbz %x[width], #1, 26f\n"
669 "str h5, [x26], #0x2\n"
670 "str h1, [x24], #0x2\n"
671 "tbz %x[width], #0, 31f\n"
672 "st1 { v5.b }[2], [x26], #0x1\n"
673 "st1 { v1.b }[2], [x24], #0x1\n"
674 "b 31f\n"
675 "26:" // tail loop: unique 3: partial_0_8
676 "tbz %x[width], #0, 31f\n"
677 "str b5, [x26], #0x1\n"
678 "str b1, [x24], #0x1\n"
679 "b 31f\n"
680 "27:" // tail loop: unique 3: partial_2_0
681 "tbz %x[width], #2, 29f\n"
682 "str s7, [x26], #0x4\n"
683 "str s3, [x24], #0x4\n"
684 "tbz %x[width], #1, 28f\n"
685 "st1 { v7.h }[2], [x26], #0x2\n"
686 "st1 { v3.h }[2], [x24], #0x2\n"
687 "tbz %x[width], #0, 31f\n"
688 "st1 { v7.b }[6], [x26], #0x1\n"
689 "st1 { v3.b }[6], [x24], #0x1\n"
690 "b 31f\n"
691 "28:" // tail loop: unique 3: partial_0_4
692 "tbz %x[width], #0, 31f\n"
693 "st1 { v7.b }[4], [x26], #0x1\n"
694 "st1 { v3.b }[4], [x24], #0x1\n"
695 "b 31f\n"
696 "29:" // tail loop: unique 3: partial_1_0
697 "tbz %x[width], #1, 30f\n"
698 "str h7, [x26], #0x2\n"
699 "str h3, [x24], #0x2\n"
700 "tbz %x[width], #0, 31f\n"
701 "st1 { v7.b }[2], [x26], #0x1\n"
702 "st1 { v3.b }[2], [x24], #0x1\n"
703 "b 31f\n"
704 "30:" // tail loop: unique 3: partial_0_0
705 "str b7, [x26], #0x1\n"
706 "str b3, [x24], #0x1\n"
707 "31:" // tail loop: unique 3: Done
708 "subs x23, x23, #0x2\n"
709 "bgt 6b\n"
710 "32:" // odd columns skip
711 : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
712 : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
713 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
714}
715
716} // namespace
717
718namespace arm_compute
719{
720namespace cpu
721{
722void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
723 ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
724{
725 ARM_COMPUTE_UNUSED(policy);
726
727 const ITensorInfo *final_output_info = final_output->info();
728 const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr;
729 const ITensorInfo *input1_info = input1->info();
730 const ITensorInfo *input2_info = input2->info();
731
732 const size_t out_stride = final_output_info->strides_in_bytes()[1];
733 const size_t out_direct_stride = (add_output != nullptr) ? add_output_info->strides_in_bytes()[1] : 0;
734 const size_t in0_stride = input1_info->strides_in_bytes()[1];
735 const size_t in1_stride = input2_info->strides_in_bytes()[1];
736
737 int8_t minval = std::numeric_limits<int8_t>::lowest();
738 int8_t maxval = std::numeric_limits<int8_t>::max();
739
740 const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
741 if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
742 {
743 minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
744 }
745 else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
746 {
747 minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
748 maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
749 }
750 else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
751 {
752 minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo);
753 maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
754 }
755
756 const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
757 const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
758 const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
759
760 const int32_t in1_offset = in1_qinfo.offset;
761 const int32_t in2_offset = in2_qinfo.offset;
762 const int32_t out_offset = final_output_qinfo.offset;
763 const int32_t out_direct_offset = add_output_qinfo.offset;
764
765 const float in1_scale = in1_qinfo.scale;
766 const float in2_scale = in2_qinfo.scale;
767 const float out_scale = final_output_qinfo.scale;
768 const float out_direct_scale = add_output_qinfo.scale;
769
770 const float *bn_mul_buffer = reinterpret_cast<float *>(bn_mul->buffer());
771 const float *bn_add_buffer = reinterpret_cast<float *>(bn_add->buffer());
772
773 // Clear X & Y dimensions on execution window as we handle manually
774 Window win = window;
775 win.set(Window::DimX, Window::Dimension(0, 1, 1));
776 win.set(Window::DimY, Window::Dimension(0, 1, 1));
777
778 Iterator in1_it(input1, window);
779 Iterator in2_it(input2, window);
780 Iterator out_it(final_output, window);
781
782 const size_t width = window.num_iterations(0);
783 const size_t height = window.num_iterations(1);
784
785 if(add_output != nullptr)
786 {
787 Iterator add_out_it(add_output, window);
788 execute_window_loop(
789 win, [&](const Coordinates &)
790 {
791 a64_add_bn_clamp_direct_s8_fp32_2x16(
792 reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
793 reinterpret_cast<int8_t *>(add_out_it.ptr()), out_direct_stride,
794 reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
795 reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
796 bn_mul_buffer,
797 bn_add_buffer,
798 minval,
799 maxval,
800 out_offset, out_scale,
801 out_direct_offset, out_direct_scale,
802 in1_offset, in1_scale,
803 in2_offset, in2_scale,
804 width, height);
805 },
806 in1_it, in2_it, add_out_it, out_it);
807 }
808 else
809 {
810 execute_window_loop(
811 win, [&](const Coordinates &)
812 {
813 a64_add_bn_clamp_direct_s8_fp32_2x16(
814 reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
815 nullptr, out_direct_stride,
816 reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
817 reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
818 bn_mul_buffer,
819 bn_add_buffer,
820 minval,
821 maxval,
822 out_offset, out_scale,
823 out_direct_offset, out_direct_scale,
824 in1_offset, in1_scale,
825 in2_offset, in2_scale,
826 width, height);
827 },
828 in1_it, in2_it, out_it);
829 }
830}
831} // namespace cpu
832} // namespace arm_compute
833
834#endif // __aarch64__