blob: e1a45b467b7f6be39cec7bcc2b7d882ef2fe348f [file] [log] [blame]
Gunes Bayirae72a462023-01-29 13:24:24 +00001/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
Gunes Bayirae72a462023-01-29 13:24:24 +000025#include "arm_compute/core/Helpers.h"
26#include "arm_compute/core/ITensor.h"
27#include "arm_compute/core/QuantizationInfo.h"
28#include "arm_compute/core/Types.h"
29#include "arm_compute/core/Window.h"
SiCong Li91295492023-07-21 18:16:13 +010030#include "arm_compute/function_info/ActivationLayerInfo.h"
Gunes Bayirae72a462023-01-29 13:24:24 +000031
32#include <cstddef>
33#include <cstdint>
34#include <limits>
35
36#ifdef __aarch64__
37namespace
38{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010039void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out,
40 size_t out_stride,
41 int8_t *out_direct,
42 size_t out_direct_stride,
43 const int8_t *in0,
44 size_t in0_stride,
45 const int8_t *in1,
46 size_t in1_stride,
47 const float *bn_mul,
48 const float *bn_add,
49 const int8_t minval,
50 const int8_t maxval,
51 int32_t out_zeropt,
52 float out_scale,
53 int32_t out_direct_zeropt,
54 float out_direct_scale,
55 int32_t in0_zeropt,
56 float in0_scale,
57 int32_t in1_zeropt,
58 float in1_scale,
59 size_t width,
60 size_t height)
Gunes Bayirae72a462023-01-29 13:24:24 +000061{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010062 float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
Gunes Bayirae72a462023-01-29 13:24:24 +000063 struct KernelArgs
64 {
65 const float *scales;
66 int32_t in0_zeropt;
67 int32_t in1_zeropt;
68 int32_t out_zeropt;
69 int32_t out_direct_zeropt;
70 int32_t minval;
71 int32_t maxval;
72 } ka;
73 ka.scales = scales;
74 ka.in0_zeropt = in0_zeropt;
75 ka.in1_zeropt = in1_zeropt;
76 ka.out_zeropt = out_zeropt;
77 ka.out_direct_zeropt = out_direct_zeropt;
78 ka.minval = minval;
79 ka.maxval = maxval;
80
81 __asm__ __volatile__(
82 "ldr x20, [%x[args_ptr], %[offsetof_scales]]\n"
83 "ld1 { v0.4s }, [x20]\n"
84 "cmp %x[width], #0x10\n"
85 "blt 5f\n"
86 "1:" // Column loop
87 "ldr q24, [%x[bn_mul], #0x0]\n"
88 "ldr q25, [%x[bn_mul], #0x10]\n"
89 "mov x23, %x[height]\n"
90 "mov x12, %x[in0]\n"
91 "ldr q26, [%x[bn_mul], #0x20]\n"
92 "ldr q27, [%x[bn_mul], #0x30]\n"
93 "mov x11, %x[in1]\n"
94 "mov x10, %x[out]\n"
95 "ldr q28, [%x[bn_add], #0x0]\n"
96 "ldr q29, [%x[bn_add], #0x10]\n"
97 "mov x9, %x[out_direct]\n"
98 "add %x[bn_mul], %x[bn_mul], #0x40\n"
99 "ldr q30, [%x[bn_add], #0x20]\n"
100 "ldr q31, [%x[bn_add], #0x30]\n"
101 "add %x[bn_add], %x[bn_add], #0x40\n"
102 "2:" // Row loop
103 "mov x28, x12\n"
104 "ldr d4, [x28, #0x0]\n"
105 "ldr d3, [x28, #0x8]\n"
106 "add x21, x28, %x[in0_stride]\n"
107 "mov x27, x11\n"
108 "ldr d13, [x27, #0x0]\n"
109 "ldr d12, [x27, #0x8]\n"
110 "cmp x23, #0x2\n"
111 "add x12, x21, %x[in0_stride]\n"
112 "csel x21, x21, x28, GE\n"
113 "ldr d2, [x21, #0x0]\n"
114 "ldr d11, [x21, #0x8]\n"
115 "add x20, x27, %x[in1_stride]\n"
116 "add x11, x20, %x[in1_stride]\n"
117 "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
118 "sshll v4.8h, v4.8b, #0x0\n"
119 "csel x20, x20, x27, GE\n"
120 "ldr d10, [x20, #0x0]\n"
121 "ldr d9, [x20, #0x8]\n"
122 "sshll v3.8h, v3.8b, #0x0\n"
123 "sshll v2.8h, v2.8b, #0x0\n"
124 "sshll v11.8h, v11.8b, #0x0\n"
125 "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
126 "mov x26, x10\n"
127 "dup v16.8h, w21\n"
128 "sshll v13.8h, v13.8b, #0x0\n"
129 "mov x25, x9\n"
130 "add x24, x26, %x[out_stride]\n"
131 "sshll v12.8h, v12.8b, #0x0\n"
132 "sshll v10.8h, v10.8b, #0x0\n"
133 "add x22, x25, %x[out_direct_stride]\n"
134 "add x10, x24, %x[out_stride]\n"
135 "sshll v9.8h, v9.8b, #0x0\n"
136 "ssubl v1.4s, v4.4h, v16.4h\n"
137 "add x9, x22, %x[out_direct_stride]\n"
138 "csel x24, x24, x26, GE\n"
139 "ssubl2 v4.4s, v4.8h, v16.8h\n"
140 "ssubl v23.4s, v3.4h, v16.4h\n"
141 "csel x22, x22, x25, GE\n"
142 "ssubl2 v3.4s, v3.8h, v16.8h\n"
143 "ssubl v22.4s, v2.4h, v16.4h\n"
144 "ssubl2 v2.4s, v2.8h, v16.8h\n"
145 "ssubl v21.4s, v11.4h, v16.4h\n"
146 "ssubl2 v11.4s, v11.8h, v16.8h\n"
147 "dup v20.8h, w20\n"
148 "ssubl v19.4s, v13.4h, v20.4h\n"
149 "ssubl2 v13.4s, v13.8h, v20.8h\n"
150 "ssubl v18.4s, v12.4h, v20.4h\n"
151 "ssubl2 v12.4s, v12.8h, v20.8h\n"
152 "ssubl v17.4s, v10.4h, v20.4h\n"
153 "ssubl2 v10.4s, v10.8h, v20.8h\n"
154 "ssubl v16.4s, v9.4h, v20.4h\n"
155 "ssubl2 v9.4s, v9.8h, v20.8h\n"
156 "scvtf v8.4s, v1.4s\n"
157 "scvtf v7.4s, v4.4s\n"
158 "scvtf v6.4s, v23.4s\n"
159 "scvtf v5.4s, v3.4s\n"
160 "scvtf v4.4s, v22.4s\n"
161 "scvtf v3.4s, v2.4s\n"
162 "scvtf v2.4s, v21.4s\n"
163 "scvtf v1.4s, v11.4s\n"
164 "scvtf v19.4s, v19.4s\n"
165 "fmul v8.4s, v8.4s, v0.s[0]\n"
166 "fmla v8.4s, v19.4s, v0.s[1]\n"
167 "scvtf v13.4s, v13.4s\n"
168 "fmul v7.4s, v7.4s, v0.s[0]\n"
169 "fmla v7.4s, v13.4s, v0.s[1]\n"
170 "scvtf v18.4s, v18.4s\n"
171 "fmul v6.4s, v6.4s, v0.s[0]\n"
172 "fmla v6.4s, v18.4s, v0.s[1]\n"
173 "scvtf v12.4s, v12.4s\n"
174 "fmul v5.4s, v5.4s, v0.s[0]\n"
175 "fmla v5.4s, v12.4s, v0.s[1]\n"
176 "scvtf v17.4s, v17.4s\n"
177 "fmul v4.4s, v4.4s, v0.s[0]\n"
178 "fmla v4.4s, v17.4s, v0.s[1]\n"
179 "scvtf v10.4s, v10.4s\n"
180 "fmul v3.4s, v3.4s, v0.s[0]\n"
181 "fmla v3.4s, v10.4s, v0.s[1]\n"
182 "scvtf v16.4s, v16.4s\n"
183 "fmul v2.4s, v2.4s, v0.s[0]\n"
184 "fmla v2.4s, v16.4s, v0.s[1]\n"
185 "scvtf v9.4s, v9.4s\n"
186 "fmul v1.4s, v1.4s, v0.s[0]\n"
187 "fmla v1.4s, v9.4s, v0.s[1]\n"
188 "cbz %x[out_direct], 3f\n"
189 "fmul v23.4s, v8.4s, v0.s[3]\n"
190 "fmul v22.4s, v7.4s, v0.s[3]\n"
191 "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
192 "fmul v21.4s, v6.4s, v0.s[3]\n"
193 "fmul v20.4s, v5.4s, v0.s[3]\n"
194 "fmul v17.4s, v4.4s, v0.s[3]\n"
195 "fmul v19.4s, v3.4s, v0.s[3]\n"
196 "fmul v16.4s, v2.4s, v0.s[3]\n"
197 "fmul v18.4s, v1.4s, v0.s[3]\n"
198 "fcvtas v23.4s, v23.4s\n"
199 "fcvtas v22.4s, v22.4s\n"
200 "fcvtas v21.4s, v21.4s\n"
201 "fcvtas v20.4s, v20.4s\n"
202 "fcvtas v17.4s, v17.4s\n"
203 "fcvtas v19.4s, v19.4s\n"
204 "fcvtas v16.4s, v16.4s\n"
205 "fcvtas v18.4s, v18.4s\n"
206 "uzp1 v22.8h, v23.8h, v22.8h\n"
207 "uzp1 v20.8h, v21.8h, v20.8h\n"
208 "uzp1 v19.8h, v17.8h, v19.8h\n"
209 "uzp1 v18.8h, v16.8h, v18.8h\n"
210 "dup v16.8h, w20\n"
211 "add v22.8h, v22.8h, v16.8h\n"
212 "add v20.8h, v20.8h, v16.8h\n"
213 "add v19.8h, v19.8h, v16.8h\n"
214 "add v18.8h, v18.8h, v16.8h\n"
215 "movi v17.8h, #0x7f\n"
216 "mvni v16.8h, #0x7f\n"
217 "smin v22.8h, v22.8h, v17.8h\n"
218 "smin v20.8h, v20.8h, v17.8h\n"
219 "smin v19.8h, v19.8h, v17.8h\n"
220 "smin v18.8h, v18.8h, v17.8h\n"
221 "smax v22.8h, v22.8h, v16.8h\n"
222 "smax v20.8h, v20.8h, v16.8h\n"
223 "smax v19.8h, v19.8h, v16.8h\n"
224 "smax v18.8h, v18.8h, v16.8h\n"
225 "xtn v22.8b, v22.8h\n"
226 "str d22, [x25, #0x0]\n"
227 "xtn v20.8b, v20.8h\n"
228 "xtn v19.8b, v19.8h\n"
229 "str d20, [x25, #0x8]\n"
230 "xtn v18.8b, v18.8h\n"
231 "str d19, [x22, #0x0]\n"
232 "str d18, [x22, #0x8]\n"
233 "3:" // Main loop: No direct output
234 "mov v19.16b, v28.16b\n"
235 "mov v13.16b, v29.16b\n"
236 "fmla v19.4s, v8.4s, v24.4s\n"
237 "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
238 "mov v18.16b, v30.16b\n"
239 "mov v12.16b, v31.16b\n"
240 "fmla v13.4s, v7.4s, v25.4s\n"
241 "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
242 "mov v17.16b, v28.16b\n"
243 "mov v10.16b, v29.16b\n"
244 "fmla v18.4s, v6.4s, v26.4s\n"
245 "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
246 "mov v16.16b, v30.16b\n"
247 "mov v9.16b, v31.16b\n"
248 "fmla v12.4s, v5.4s, v27.4s\n"
249 "subs x23, x23, #0x2\n"
250 "fmla v17.4s, v4.4s, v24.4s\n"
251 "fmla v10.4s, v3.4s, v25.4s\n"
252 "fmul v8.4s, v19.4s, v0.s[2]\n"
253 "fmla v16.4s, v2.4s, v26.4s\n"
254 "fmla v9.4s, v1.4s, v27.4s\n"
255 "fmul v7.4s, v13.4s, v0.s[2]\n"
256 "fmul v6.4s, v18.4s, v0.s[2]\n"
257 "fmul v5.4s, v12.4s, v0.s[2]\n"
258 "fmul v4.4s, v17.4s, v0.s[2]\n"
259 "fmul v3.4s, v10.4s, v0.s[2]\n"
260 "fmul v2.4s, v16.4s, v0.s[2]\n"
261 "fmul v1.4s, v9.4s, v0.s[2]\n"
262 "fcvtas v8.4s, v8.4s\n"
263 "fcvtas v7.4s, v7.4s\n"
264 "fcvtas v6.4s, v6.4s\n"
265 "fcvtas v5.4s, v5.4s\n"
266 "fcvtas v4.4s, v4.4s\n"
267 "fcvtas v3.4s, v3.4s\n"
268 "fcvtas v2.4s, v2.4s\n"
269 "fcvtas v1.4s, v1.4s\n"
270 "uzp1 v7.8h, v8.8h, v7.8h\n"
271 "uzp1 v5.8h, v6.8h, v5.8h\n"
272 "uzp1 v3.8h, v4.8h, v3.8h\n"
273 "uzp1 v1.8h, v2.8h, v1.8h\n"
274 "dup v16.8h, w22\n"
275 "add v7.8h, v7.8h, v16.8h\n"
276 "add v5.8h, v5.8h, v16.8h\n"
277 "add v3.8h, v3.8h, v16.8h\n"
278 "add v1.8h, v1.8h, v16.8h\n"
279 "dup v16.8h, w21\n"
280 "smin v7.8h, v7.8h, v16.8h\n"
281 "smin v5.8h, v5.8h, v16.8h\n"
282 "smin v3.8h, v3.8h, v16.8h\n"
283 "smin v1.8h, v1.8h, v16.8h\n"
284 "dup v16.8h, w20\n"
285 "smax v7.8h, v7.8h, v16.8h\n"
286 "smax v5.8h, v5.8h, v16.8h\n"
287 "smax v3.8h, v3.8h, v16.8h\n"
288 "smax v1.8h, v1.8h, v16.8h\n"
289 "xtn v7.8b, v7.8h\n"
290 "str d7, [x26, #0x0]\n"
291 "xtn v5.8b, v5.8h\n"
292 "xtn v3.8b, v3.8h\n"
293 "str d5, [x26, #0x8]\n"
294 "xtn v1.8b, v1.8h\n"
295 "str d3, [x24, #0x0]\n"
296 "str d1, [x24, #0x8]\n"
297 "bgt 2b\n"
298 "add %x[in0], %x[in0], #0x10\n"
299 "add %x[in1], %x[in1], #0x10\n"
300 "add %x[out], %x[out], #0x10\n"
301 "cbz %x[out_direct], 4f\n"
302 "add %x[out_direct], %x[out_direct], #0x10\n"
303 "4:" // No direct pointer update
304 "sub %x[width], %x[width], #0x10\n"
305 "cmp %x[width], #0x10\n"
306 "bge 1b\n"
307 "cbz %x[width], 32f\n"
308 "5:" // main loop skip
309 "ldr q24, [%x[bn_mul], #0x0]\n"
310 "ldr q25, [%x[bn_mul], #0x10]\n"
311 "mov x23, %x[height]\n"
312 "mov x12, %x[in0]\n"
313 "ldr q26, [%x[bn_mul], #0x20]\n"
314 "ldr q27, [%x[bn_mul], #0x30]\n"
315 "mov x11, %x[in1]\n"
316 "mov x10, %x[out]\n"
317 "ldr q28, [%x[bn_add], #0x0]\n"
318 "ldr q29, [%x[bn_add], #0x10]\n"
319 "mov x9, %x[out_direct]\n"
320 "add %x[bn_mul], %x[bn_mul], #0x40\n"
321 "ldr q30, [%x[bn_add], #0x20]\n"
322 "ldr q31, [%x[bn_add], #0x30]\n"
323 "add %x[bn_add], %x[bn_add], #0x40\n"
324 "6:" // tail loop: Row loop
325 "mov x28, x12\n"
326 "mov x27, x11\n"
327 "mov x26, x10\n"
328 "mov x25, x9\n"
329 "add x21, x28, %x[in0_stride]\n"
330 "add x20, x27, %x[in1_stride]\n"
331 "add x24, x26, %x[out_stride]\n"
332 "add x22, x25, %x[out_direct_stride]\n"
333 "cmp x23, #0x2\n"
334 "add x12, x21, %x[in0_stride]\n"
335 "add x11, x20, %x[in1_stride]\n"
336 "add x10, x24, %x[out_stride]\n"
337 "add x9, x22, %x[out_direct_stride]\n"
338 "csel x21, x21, x28, GE\n"
339 "csel x20, x20, x27, GE\n"
340 "csel x24, x24, x26, GE\n"
341 "csel x22, x22, x25, GE\n"
342 "tbz %x[width], #3, 10f\n"
343 "ldr d4, [x28, #0x0]\n"
344 "ldr d13, [x27, #0x0]\n"
345 "add x28, x28, #0x8\n"
346 "add x27, x27, #0x8\n"
347 "ldr d2, [x21, #0x0]\n"
348 "ldr d10, [x20, #0x0]\n"
349 "add x21, x21, #0x8\n"
350 "add x20, x20, #0x8\n"
351 "tbz %x[width], #2, 8f\n"
352 "ldr s3, [x28], #0x4\n"
353 "ldr s12, [x27], #0x4\n"
354 "ldr s11, [x21], #0x4\n"
355 "ldr s9, [x20], #0x4\n"
356 "tbz %x[width], #1, 7f\n"
357 "ld1 { v3.h }[2], [x28], #0x2\n"
358 "ld1 { v12.h }[2], [x27], #0x2\n"
359 "ld1 { v11.h }[2], [x21], #0x2\n"
360 "ld1 { v9.h }[2], [x20], #0x2\n"
361 "tbz %x[width], #0, 14f\n"
362 "ld1 { v3.b }[6], [x28], #0x1\n"
363 "ld1 { v12.b }[6], [x27], #0x1\n"
364 "ld1 { v11.b }[6], [x21], #0x1\n"
365 "ld1 { v9.b }[6], [x20], #0x1\n"
366 "b 14f\n"
367 "7:" // tail loop: unique 1: partial_0_12
368 "tbz %x[width], #0, 14f\n"
369 "ld1 { v3.b }[4], [x28], #0x1\n"
370 "ld1 { v12.b }[4], [x27], #0x1\n"
371 "ld1 { v11.b }[4], [x21], #0x1\n"
372 "ld1 { v9.b }[4], [x20], #0x1\n"
373 "b 14f\n"
374 "8:" // tail loop: unique 1: partial_1_8
375 "tbz %x[width], #1, 9f\n"
376 "ldr h3, [x28], #0x2\n"
377 "ldr h12, [x27], #0x2\n"
378 "ldr h11, [x21], #0x2\n"
379 "ldr h9, [x20], #0x2\n"
380 "tbz %x[width], #0, 14f\n"
381 "ld1 { v3.b }[2], [x28], #0x1\n"
382 "ld1 { v12.b }[2], [x27], #0x1\n"
383 "ld1 { v11.b }[2], [x21], #0x1\n"
384 "ld1 { v9.b }[2], [x20], #0x1\n"
385 "b 14f\n"
386 "9:" // tail loop: unique 1: partial_0_8
387 "tbz %x[width], #0, 14f\n"
388 "ldr b3, [x28], #0x1\n"
389 "ldr b12, [x27], #0x1\n"
390 "ldr b11, [x21], #0x1\n"
391 "ldr b9, [x20], #0x1\n"
392 "b 14f\n"
393 "10:" // tail loop: unique 1: partial_2_0
394 "tbz %x[width], #2, 12f\n"
395 "ldr s4, [x28], #0x4\n"
396 "ldr s13, [x27], #0x4\n"
397 "ldr s2, [x21], #0x4\n"
398 "ldr s10, [x20], #0x4\n"
399 "tbz %x[width], #1, 11f\n"
400 "ld1 { v4.h }[2], [x28], #0x2\n"
401 "ld1 { v13.h }[2], [x27], #0x2\n"
402 "ld1 { v2.h }[2], [x21], #0x2\n"
403 "ld1 { v10.h }[2], [x20], #0x2\n"
404 "tbz %x[width], #0, 14f\n"
405 "ld1 { v4.b }[6], [x28], #0x1\n"
406 "ld1 { v13.b }[6], [x27], #0x1\n"
407 "ld1 { v2.b }[6], [x21], #0x1\n"
408 "ld1 { v10.b }[6], [x20], #0x1\n"
409 "b 14f\n"
410 "11:" // tail loop: unique 1: partial_0_4
411 "tbz %x[width], #0, 14f\n"
412 "ld1 { v4.b }[4], [x28], #0x1\n"
413 "ld1 { v13.b }[4], [x27], #0x1\n"
414 "ld1 { v2.b }[4], [x21], #0x1\n"
415 "ld1 { v10.b }[4], [x20], #0x1\n"
416 "b 14f\n"
417 "12:" // tail loop: unique 1: partial_1_0
418 "tbz %x[width], #1, 13f\n"
419 "ldr h4, [x28], #0x2\n"
420 "ldr h13, [x27], #0x2\n"
421 "ldr h2, [x21], #0x2\n"
422 "ldr h10, [x20], #0x2\n"
423 "tbz %x[width], #0, 14f\n"
424 "ld1 { v4.b }[2], [x28], #0x1\n"
425 "ld1 { v13.b }[2], [x27], #0x1\n"
426 "ld1 { v2.b }[2], [x21], #0x1\n"
427 "ld1 { v10.b }[2], [x20], #0x1\n"
428 "b 14f\n"
429 "13:" // tail loop: unique 1: partial_0_0
430 "ldr b4, [x28], #0x1\n"
431 "ldr b13, [x27], #0x1\n"
432 "ldr b2, [x21], #0x1\n"
433 "ldr b10, [x20], #0x1\n"
434 "14:" // tail loop: unique 1: Done
435 "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
436 "sshll v4.8h, v4.8b, #0x0\n"
437 "sshll v3.8h, v3.8b, #0x0\n"
438 "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
439 "sshll v2.8h, v2.8b, #0x0\n"
440 "sshll v11.8h, v11.8b, #0x0\n"
441 "dup v16.8h, w21\n"
442 "sshll v13.8h, v13.8b, #0x0\n"
443 "sshll v12.8h, v12.8b, #0x0\n"
444 "sshll v10.8h, v10.8b, #0x0\n"
445 "sshll v9.8h, v9.8b, #0x0\n"
446 "ssubl v1.4s, v4.4h, v16.4h\n"
447 "ssubl2 v4.4s, v4.8h, v16.8h\n"
448 "ssubl v23.4s, v3.4h, v16.4h\n"
449 "ssubl2 v3.4s, v3.8h, v16.8h\n"
450 "ssubl v22.4s, v2.4h, v16.4h\n"
451 "ssubl2 v2.4s, v2.8h, v16.8h\n"
452 "ssubl v21.4s, v11.4h, v16.4h\n"
453 "ssubl2 v11.4s, v11.8h, v16.8h\n"
454 "dup v20.8h, w20\n"
455 "ssubl v19.4s, v13.4h, v20.4h\n"
456 "ssubl2 v13.4s, v13.8h, v20.8h\n"
457 "ssubl v18.4s, v12.4h, v20.4h\n"
458 "ssubl2 v12.4s, v12.8h, v20.8h\n"
459 "ssubl v17.4s, v10.4h, v20.4h\n"
460 "ssubl2 v10.4s, v10.8h, v20.8h\n"
461 "ssubl v16.4s, v9.4h, v20.4h\n"
462 "ssubl2 v9.4s, v9.8h, v20.8h\n"
463 "scvtf v8.4s, v1.4s\n"
464 "scvtf v7.4s, v4.4s\n"
465 "scvtf v6.4s, v23.4s\n"
466 "scvtf v5.4s, v3.4s\n"
467 "scvtf v4.4s, v22.4s\n"
468 "scvtf v3.4s, v2.4s\n"
469 "scvtf v2.4s, v21.4s\n"
470 "scvtf v1.4s, v11.4s\n"
471 "scvtf v19.4s, v19.4s\n"
472 "fmul v8.4s, v8.4s, v0.s[0]\n"
473 "fmla v8.4s, v19.4s, v0.s[1]\n"
474 "scvtf v13.4s, v13.4s\n"
475 "fmul v7.4s, v7.4s, v0.s[0]\n"
476 "fmla v7.4s, v13.4s, v0.s[1]\n"
477 "scvtf v18.4s, v18.4s\n"
478 "fmul v6.4s, v6.4s, v0.s[0]\n"
479 "fmla v6.4s, v18.4s, v0.s[1]\n"
480 "scvtf v12.4s, v12.4s\n"
481 "fmul v5.4s, v5.4s, v0.s[0]\n"
482 "fmla v5.4s, v12.4s, v0.s[1]\n"
483 "scvtf v17.4s, v17.4s\n"
484 "fmul v4.4s, v4.4s, v0.s[0]\n"
485 "fmla v4.4s, v17.4s, v0.s[1]\n"
486 "scvtf v10.4s, v10.4s\n"
487 "fmul v3.4s, v3.4s, v0.s[0]\n"
488 "fmla v3.4s, v10.4s, v0.s[1]\n"
489 "scvtf v16.4s, v16.4s\n"
490 "fmul v2.4s, v2.4s, v0.s[0]\n"
491 "fmla v2.4s, v16.4s, v0.s[1]\n"
492 "scvtf v9.4s, v9.4s\n"
493 "fmul v1.4s, v1.4s, v0.s[0]\n"
494 "fmla v1.4s, v9.4s, v0.s[1]\n"
495 "cbz %x[out_direct], 23f\n"
496 "fmul v23.4s, v8.4s, v0.s[3]\n"
497 "fmul v22.4s, v7.4s, v0.s[3]\n"
498 "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
499 "fmul v21.4s, v6.4s, v0.s[3]\n"
500 "fmul v20.4s, v5.4s, v0.s[3]\n"
501 "fmul v17.4s, v4.4s, v0.s[3]\n"
502 "fmul v19.4s, v3.4s, v0.s[3]\n"
503 "fmul v16.4s, v2.4s, v0.s[3]\n"
504 "fmul v18.4s, v1.4s, v0.s[3]\n"
505 "fcvtas v23.4s, v23.4s\n"
506 "fcvtas v22.4s, v22.4s\n"
507 "fcvtas v21.4s, v21.4s\n"
508 "fcvtas v20.4s, v20.4s\n"
509 "fcvtas v17.4s, v17.4s\n"
510 "fcvtas v19.4s, v19.4s\n"
511 "fcvtas v16.4s, v16.4s\n"
512 "fcvtas v18.4s, v18.4s\n"
513 "uzp1 v22.8h, v23.8h, v22.8h\n"
514 "uzp1 v20.8h, v21.8h, v20.8h\n"
515 "uzp1 v19.8h, v17.8h, v19.8h\n"
516 "uzp1 v18.8h, v16.8h, v18.8h\n"
517 "dup v16.8h, w20\n"
518 "add v22.8h, v22.8h, v16.8h\n"
519 "add v20.8h, v20.8h, v16.8h\n"
520 "add v19.8h, v19.8h, v16.8h\n"
521 "add v18.8h, v18.8h, v16.8h\n"
522 "movi v17.8h, #0x7f\n"
523 "mvni v16.8h, #0x7f\n"
524 "smin v22.8h, v22.8h, v17.8h\n"
525 "smin v20.8h, v20.8h, v17.8h\n"
526 "smin v19.8h, v19.8h, v17.8h\n"
527 "smin v18.8h, v18.8h, v17.8h\n"
528 "smax v22.8h, v22.8h, v16.8h\n"
529 "smax v20.8h, v20.8h, v16.8h\n"
530 "smax v19.8h, v19.8h, v16.8h\n"
531 "smax v18.8h, v18.8h, v16.8h\n"
532 "xtn v22.8b, v22.8h\n"
533 "xtn v20.8b, v20.8h\n"
534 "xtn v19.8b, v19.8h\n"
535 "xtn v18.8b, v18.8h\n"
536 "tbz %x[width], #3, 18f\n"
537 "str d22, [x25, #0x0]\n"
538 "add x25, x25, #0x8\n"
539 "str d19, [x22, #0x0]\n"
540 "add x22, x22, #0x8\n"
541 "tbz %x[width], #2, 16f\n"
542 "str s20, [x25], #0x4\n"
543 "str s18, [x22], #0x4\n"
544 "tbz %x[width], #1, 15f\n"
545 "st1 { v20.h }[2], [x25], #0x2\n"
546 "st1 { v18.h }[2], [x22], #0x2\n"
547 "tbz %x[width], #0, 22f\n"
548 "st1 { v20.b }[6], [x25], #0x1\n"
549 "st1 { v18.b }[6], [x22], #0x1\n"
550 "b 22f\n"
551 "15:" // tail loop: Main loop: unique 2: partial_0_12
552 "tbz %x[width], #0, 22f\n"
553 "st1 { v20.b }[4], [x25], #0x1\n"
554 "st1 { v18.b }[4], [x22], #0x1\n"
555 "b 22f\n"
556 "16:" // tail loop: Main loop: unique 2: partial_1_8
557 "tbz %x[width], #1, 17f\n"
558 "str h20, [x25], #0x2\n"
559 "str h18, [x22], #0x2\n"
560 "tbz %x[width], #0, 22f\n"
561 "st1 { v20.b }[2], [x25], #0x1\n"
562 "st1 { v18.b }[2], [x22], #0x1\n"
563 "b 22f\n"
564 "17:" // tail loop: Main loop: unique 2: partial_0_8
565 "tbz %x[width], #0, 22f\n"
566 "str b20, [x25], #0x1\n"
567 "str b18, [x22], #0x1\n"
568 "b 22f\n"
569 "18:" // tail loop: Main loop: unique 2: partial_2_0
570 "tbz %x[width], #2, 20f\n"
571 "str s22, [x25], #0x4\n"
572 "str s19, [x22], #0x4\n"
573 "tbz %x[width], #1, 19f\n"
574 "st1 { v22.h }[2], [x25], #0x2\n"
575 "st1 { v19.h }[2], [x22], #0x2\n"
576 "tbz %x[width], #0, 22f\n"
577 "st1 { v22.b }[6], [x25], #0x1\n"
578 "st1 { v19.b }[6], [x22], #0x1\n"
579 "b 22f\n"
580 "19:" // tail loop: Main loop: unique 2: partial_0_4
581 "tbz %x[width], #0, 22f\n"
582 "st1 { v22.b }[4], [x25], #0x1\n"
583 "st1 { v19.b }[4], [x22], #0x1\n"
584 "b 22f\n"
585 "20:" // tail loop: Main loop: unique 2: partial_1_0
586 "tbz %x[width], #1, 21f\n"
587 "str h22, [x25], #0x2\n"
588 "str h19, [x22], #0x2\n"
589 "tbz %x[width], #0, 22f\n"
590 "st1 { v22.b }[2], [x25], #0x1\n"
591 "st1 { v19.b }[2], [x22], #0x1\n"
592 "b 22f\n"
593 "21:" // tail loop: Main loop: unique 2: partial_0_0
594 "str b22, [x25], #0x1\n"
595 "str b19, [x22], #0x1\n"
596 "22:" // tail loop: Main loop: unique 2: Done
597 "23:" // tail loop: Main loop: No direct output
598 "mov v19.16b, v28.16b\n"
599 "mov v13.16b, v29.16b\n"
600 "fmla v19.4s, v8.4s, v24.4s\n"
601 "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
602 "mov v18.16b, v30.16b\n"
603 "mov v12.16b, v31.16b\n"
604 "fmla v13.4s, v7.4s, v25.4s\n"
605 "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
606 "mov v17.16b, v28.16b\n"
607 "mov v10.16b, v29.16b\n"
608 "fmla v18.4s, v6.4s, v26.4s\n"
609 "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
610 "mov v16.16b, v30.16b\n"
611 "mov v9.16b, v31.16b\n"
612 "fmla v12.4s, v5.4s, v27.4s\n"
613 "fmla v17.4s, v4.4s, v24.4s\n"
614 "fmla v10.4s, v3.4s, v25.4s\n"
615 "fmul v8.4s, v19.4s, v0.s[2]\n"
616 "fmla v16.4s, v2.4s, v26.4s\n"
617 "fmla v9.4s, v1.4s, v27.4s\n"
618 "fmul v7.4s, v13.4s, v0.s[2]\n"
619 "fmul v6.4s, v18.4s, v0.s[2]\n"
620 "fmul v5.4s, v12.4s, v0.s[2]\n"
621 "fmul v4.4s, v17.4s, v0.s[2]\n"
622 "fmul v3.4s, v10.4s, v0.s[2]\n"
623 "fmul v2.4s, v16.4s, v0.s[2]\n"
624 "fmul v1.4s, v9.4s, v0.s[2]\n"
625 "fcvtas v8.4s, v8.4s\n"
626 "fcvtas v7.4s, v7.4s\n"
627 "fcvtas v6.4s, v6.4s\n"
628 "fcvtas v5.4s, v5.4s\n"
629 "fcvtas v4.4s, v4.4s\n"
630 "fcvtas v3.4s, v3.4s\n"
631 "fcvtas v2.4s, v2.4s\n"
632 "fcvtas v1.4s, v1.4s\n"
633 "uzp1 v7.8h, v8.8h, v7.8h\n"
634 "uzp1 v5.8h, v6.8h, v5.8h\n"
635 "uzp1 v3.8h, v4.8h, v3.8h\n"
636 "uzp1 v1.8h, v2.8h, v1.8h\n"
637 "dup v16.8h, w22\n"
638 "add v7.8h, v7.8h, v16.8h\n"
639 "add v5.8h, v5.8h, v16.8h\n"
640 "add v3.8h, v3.8h, v16.8h\n"
641 "add v1.8h, v1.8h, v16.8h\n"
642 "dup v16.8h, w21\n"
643 "smin v7.8h, v7.8h, v16.8h\n"
644 "smin v5.8h, v5.8h, v16.8h\n"
645 "smin v3.8h, v3.8h, v16.8h\n"
646 "smin v1.8h, v1.8h, v16.8h\n"
647 "dup v16.8h, w20\n"
648 "smax v7.8h, v7.8h, v16.8h\n"
649 "smax v5.8h, v5.8h, v16.8h\n"
650 "smax v3.8h, v3.8h, v16.8h\n"
651 "smax v1.8h, v1.8h, v16.8h\n"
652 "xtn v7.8b, v7.8h\n"
653 "xtn v5.8b, v5.8h\n"
654 "xtn v3.8b, v3.8h\n"
655 "xtn v1.8b, v1.8h\n"
656 "tbz %x[width], #3, 27f\n"
657 "str d7, [x26, #0x0]\n"
658 "add x26, x26, #0x8\n"
659 "str d3, [x24, #0x0]\n"
660 "add x24, x24, #0x8\n"
661 "tbz %x[width], #2, 25f\n"
662 "str s5, [x26], #0x4\n"
663 "str s1, [x24], #0x4\n"
664 "tbz %x[width], #1, 24f\n"
665 "st1 { v5.h }[2], [x26], #0x2\n"
666 "st1 { v1.h }[2], [x24], #0x2\n"
667 "tbz %x[width], #0, 31f\n"
668 "st1 { v5.b }[6], [x26], #0x1\n"
669 "st1 { v1.b }[6], [x24], #0x1\n"
670 "b 31f\n"
671 "24:" // tail loop: unique 3: partial_0_12
672 "tbz %x[width], #0, 31f\n"
673 "st1 { v5.b }[4], [x26], #0x1\n"
674 "st1 { v1.b }[4], [x24], #0x1\n"
675 "b 31f\n"
676 "25:" // tail loop: unique 3: partial_1_8
677 "tbz %x[width], #1, 26f\n"
678 "str h5, [x26], #0x2\n"
679 "str h1, [x24], #0x2\n"
680 "tbz %x[width], #0, 31f\n"
681 "st1 { v5.b }[2], [x26], #0x1\n"
682 "st1 { v1.b }[2], [x24], #0x1\n"
683 "b 31f\n"
684 "26:" // tail loop: unique 3: partial_0_8
685 "tbz %x[width], #0, 31f\n"
686 "str b5, [x26], #0x1\n"
687 "str b1, [x24], #0x1\n"
688 "b 31f\n"
689 "27:" // tail loop: unique 3: partial_2_0
690 "tbz %x[width], #2, 29f\n"
691 "str s7, [x26], #0x4\n"
692 "str s3, [x24], #0x4\n"
693 "tbz %x[width], #1, 28f\n"
694 "st1 { v7.h }[2], [x26], #0x2\n"
695 "st1 { v3.h }[2], [x24], #0x2\n"
696 "tbz %x[width], #0, 31f\n"
697 "st1 { v7.b }[6], [x26], #0x1\n"
698 "st1 { v3.b }[6], [x24], #0x1\n"
699 "b 31f\n"
700 "28:" // tail loop: unique 3: partial_0_4
701 "tbz %x[width], #0, 31f\n"
702 "st1 { v7.b }[4], [x26], #0x1\n"
703 "st1 { v3.b }[4], [x24], #0x1\n"
704 "b 31f\n"
705 "29:" // tail loop: unique 3: partial_1_0
706 "tbz %x[width], #1, 30f\n"
707 "str h7, [x26], #0x2\n"
708 "str h3, [x24], #0x2\n"
709 "tbz %x[width], #0, 31f\n"
710 "st1 { v7.b }[2], [x26], #0x1\n"
711 "st1 { v3.b }[2], [x24], #0x1\n"
712 "b 31f\n"
713 "30:" // tail loop: unique 3: partial_0_0
714 "str b7, [x26], #0x1\n"
715 "str b3, [x24], #0x1\n"
716 "31:" // tail loop: unique 3: Done
717 "subs x23, x23, #0x2\n"
718 "bgt 6b\n"
719 "32:" // odd columns skip
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100720 : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
721 [out_direct] "+&r"(out_direct), [width] "+&r"(width)
722 : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
723 [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
724 [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
725 [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
726 [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
727 [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
728 [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
729 [out_stride] "r"(out_stride)
730 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
731 "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
732 "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
Gunes Bayirae72a462023-01-29 13:24:24 +0000733}
734
735} // namespace
736
737namespace arm_compute
738{
739namespace cpu
740{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100741void add_mul_add_s8_neon(const ITensor *input1,
742 const ITensor *input2,
743 const ITensor *bn_mul,
744 const ITensor *bn_add,
745 ITensor *add_output,
746 ITensor *final_output,
747 ConvertPolicy policy,
748 const ActivationLayerInfo &act_info,
749 const Window &window)
Gunes Bayirae72a462023-01-29 13:24:24 +0000750{
751 ARM_COMPUTE_UNUSED(policy);
752
753 const ITensorInfo *final_output_info = final_output->info();
754 const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr;
755 const ITensorInfo *input1_info = input1->info();
756 const ITensorInfo *input2_info = input2->info();
757
758 const size_t out_stride = final_output_info->strides_in_bytes()[1];
759 const size_t out_direct_stride = (add_output != nullptr) ? add_output_info->strides_in_bytes()[1] : 0;
760 const size_t in0_stride = input1_info->strides_in_bytes()[1];
761 const size_t in1_stride = input2_info->strides_in_bytes()[1];
762
763 int8_t minval = std::numeric_limits<int8_t>::lowest();
764 int8_t maxval = std::numeric_limits<int8_t>::max();
765
766 const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100767 if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
Gunes Bayirae72a462023-01-29 13:24:24 +0000768 {
769 minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
770 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100771 else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
Gunes Bayirae72a462023-01-29 13:24:24 +0000772 {
773 minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
774 maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
775 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100776 else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
Gunes Bayirae72a462023-01-29 13:24:24 +0000777 {
778 minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo);
779 maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
780 }
781
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100782 const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
783 const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
784 const UniformQuantizationInfo add_output_qinfo =
785 (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
Gunes Bayirae72a462023-01-29 13:24:24 +0000786
787 const int32_t in1_offset = in1_qinfo.offset;
788 const int32_t in2_offset = in2_qinfo.offset;
789 const int32_t out_offset = final_output_qinfo.offset;
790 const int32_t out_direct_offset = add_output_qinfo.offset;
791
792 const float in1_scale = in1_qinfo.scale;
793 const float in2_scale = in2_qinfo.scale;
794 const float out_scale = final_output_qinfo.scale;
795 const float out_direct_scale = add_output_qinfo.scale;
796
797 const float *bn_mul_buffer = reinterpret_cast<float *>(bn_mul->buffer());
798 const float *bn_add_buffer = reinterpret_cast<float *>(bn_add->buffer());
799
800 // Clear X & Y dimensions on execution window as we handle manually
801 Window win = window;
802 win.set(Window::DimX, Window::Dimension(0, 1, 1));
803 win.set(Window::DimY, Window::Dimension(0, 1, 1));
804
805 Iterator in1_it(input1, window);
806 Iterator in2_it(input2, window);
807 Iterator out_it(final_output, window);
808
809 const size_t width = window.num_iterations(0);
810 const size_t height = window.num_iterations(1);
811
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100812 if (add_output != nullptr)
Gunes Bayirae72a462023-01-29 13:24:24 +0000813 {
814 Iterator add_out_it(add_output, window);
815 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100816 win,
817 [&](const Coordinates &)
818 {
819 a64_add_bn_clamp_direct_s8_fp32_2x16(
820 reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, reinterpret_cast<int8_t *>(add_out_it.ptr()),
821 out_direct_stride, reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
822 reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval,
823 out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset,
824 in2_scale, width, height);
825 },
826 in1_it, in2_it, add_out_it, out_it);
Gunes Bayirae72a462023-01-29 13:24:24 +0000827 }
828 else
829 {
830 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100831 win,
832 [&](const Coordinates &)
833 {
834 a64_add_bn_clamp_direct_s8_fp32_2x16(
835 reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
836 reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<int8_t *>(in2_it.ptr()),
837 in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
838 out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
839 },
840 in1_it, in2_it, out_it);
Gunes Bayirae72a462023-01-29 13:24:24 +0000841 }
842}
843} // namespace cpu
844} // namespace arm_compute
845
846#endif // __aarch64__