blob: 5095ecf5bd3f6edfab44052e39c5662cf4656379 [file] [log] [blame]
Michalis Spyrouc4d45552020-10-19 12:41:30 +01001/*
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +01002 * Copyright (c) 2020-2022 Arm Limited.
Michalis Spyrouc4d45552020-10-19 12:41:30 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "arm_compute/core/Helpers.h"
26#include "arm_compute/core/Window.h"
27#include "src/core/NEON/NEAsymm.h"
28#include "src/core/NEON/NEMath.h"
29#include "src/core/NEON/wrapper/wrapper.h"
Michalis Spyrouc4d45552020-10-19 12:41:30 +010030
31#include <arm_neon.h>
32#include <cmath>
33#include <cstddef>
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +010034#include <cstdint>
Michalis Spyrouc4d45552020-10-19 12:41:30 +010035
36namespace arm_compute
37{
38namespace cpu
39{
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +010040namespace
41{
42#ifdef __aarch64__
43
/** Substitute every byte of a set of equal-length byte strings with its value from a 256-entry look-up table.
 *
 * Hand-written AArch64 assembly. The 256-byte LUT is pre-loaded into q16-q31
 * (16 bytes each, offsets 0x0..0xf0). Each input byte is used as a TBL index:
 * a 4-register TBL covers indices 0..63 and yields 0 for anything out of
 * range, so the byte is also biased by -0x40/-0x80/-0xc0 (v12/v11/v10) and
 * looked up in the other three table quarters; the four results are ORed
 * together to select the single in-range hit.
 *
 * The inner loop processes 0x30 (48) bytes per iteration ("4 rounds" in the
 * generated labels); shorter remainders are handled by testing each bit of
 * the remaining length (tbz) and issuing exact-width partial loads/stores.
 *
 * @param table         Pointer to the 256-byte substitution table.
 * @param num_strings   Number of strings to process (outer loop count, x22).
 * @param string_length Length in bytes of every string (x19 per string).
 * @param input         Array of @p num_strings pointers to source strings (indexed via x22, LSL #3).
 * @param output        Array of @p num_strings pointers to destination buffers; may alias input.
 */
void substitute_bytes_neon(
    const uint8_t *table,
    size_t num_strings,
    size_t string_length,
    const uint8_t *const *input,
    uint8_t *const *output)
{
    __asm__ __volatile__(
        // Load the full 256-byte LUT into q16-q31 once, up front.
        "ldr q16, [%x[table], #0x0]\n"
        "ldr q17, [%x[table], #0x10]\n"
        "mov x22, #0x0\n"
        "ldr q18, [%x[table], #0x20]\n"
        "ldr q19, [%x[table], #0x30]\n"
        "ldr q20, [%x[table], #0x40]\n"
        "ldr q21, [%x[table], #0x50]\n"
        "ldr q22, [%x[table], #0x60]\n"
        "ldr q23, [%x[table], #0x70]\n"
        "ldr q24, [%x[table], #0x80]\n"
        "ldr q25, [%x[table], #0x90]\n"
        "ldr q26, [%x[table], #0xa0]\n"
        "ldr q27, [%x[table], #0xb0]\n"
        "ldr q28, [%x[table], #0xc0]\n"
        "ldr q29, [%x[table], #0xd0]\n"
        "ldr q30, [%x[table], #0xe0]\n"
        "ldr q31, [%x[table], #0xf0]\n"
        "1:" // string loop
        // x21/x20 = input[x22] / output[x22]; v12/v11/v10 = index biases 0x40/0x80/0xc0.
        "ldr x21, [%x[input], x22, LSL #0x3]\n"
        "ldr x20, [%x[output], x22, LSL #0x3]\n"
        "movi v12.16b, #0x40\n"
        "movi v11.16b, #0x80\n"
        "movi v10.16b, #0xc0\n"
        "mov x19, %x[string_length]\n"
        "2:" // 4 rounds: width loop
        // Full 48-byte block available? Otherwise decompose the remainder bit by bit.
        "cmp x19, #0x30\n"
        "bge 27f\n"
        "tbz x19, #5, 10f\n"
        "ld1 { v9.16b }, [x21], #0x10\n"
        "ld1 { v13.16b }, [x21], #0x10\n"
        "tbz x19, #3, 6f\n"
        "ldr d14, [x21], #0x8\n"
        "tbz x19, #2, 4f\n"
        "ld1 { v14.s }[2], [x21], #0x4\n"
        "tbz x19, #1, 3f\n"
        "ld1 { v14.h }[6], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v14.b }[14], [x21]\n"
        "b 26f\n"
        "3:" // 4 rounds: Partial load: partial_1_44
        "tbz x19, #0, 26f\n"
        "ld1 { v14.b }[12], [x21]\n"
        "b 26f\n"
        "4:" // 4 rounds: Partial load: partial_2_40
        "tbz x19, #1, 5f\n"
        "ld1 { v14.h }[4], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v14.b }[10], [x21]\n"
        "b 26f\n"
        "5:" // 4 rounds: Partial load: partial_1_40
        "tbz x19, #0, 26f\n"
        "ld1 { v14.b }[8], [x21]\n"
        "b 26f\n"
        "6:" // 4 rounds: Partial load: partial_4_32
        "tbz x19, #2, 8f\n"
        "ldr s14, [x21], #0x4\n"
        "tbz x19, #1, 7f\n"
        "ld1 { v14.h }[2], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v14.b }[6], [x21]\n"
        "b 26f\n"
        "7:" // 4 rounds: Partial load: partial_1_36
        "tbz x19, #0, 26f\n"
        "ld1 { v14.b }[4], [x21]\n"
        "b 26f\n"
        "8:" // 4 rounds: Partial load: partial_2_32
        "tbz x19, #1, 9f\n"
        "ldr h14, [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v14.b }[2], [x21]\n"
        "b 26f\n"
        "9:" // 4 rounds: Partial load: partial_1_32
        "tbz x19, #0, 26f\n"
        "ldr b14, [x21, #0x0]\n"
        "b 26f\n"
        "10:" // 4 rounds: Partial load: partial_16_0
        "tbz x19, #4, 18f\n"
        "ld1 { v9.16b }, [x21], #0x10\n"
        "tbz x19, #3, 14f\n"
        "ldr d13, [x21], #0x8\n"
        "tbz x19, #2, 12f\n"
        "ld1 { v13.s }[2], [x21], #0x4\n"
        "tbz x19, #1, 11f\n"
        "ld1 { v13.h }[6], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v13.b }[14], [x21]\n"
        "b 26f\n"
        "11:" // 4 rounds: Partial load: partial_1_28
        "tbz x19, #0, 26f\n"
        "ld1 { v13.b }[12], [x21]\n"
        "b 26f\n"
        "12:" // 4 rounds: Partial load: partial_2_24
        "tbz x19, #1, 13f\n"
        "ld1 { v13.h }[4], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v13.b }[10], [x21]\n"
        "b 26f\n"
        "13:" // 4 rounds: Partial load: partial_1_24
        "tbz x19, #0, 26f\n"
        "ld1 { v13.b }[8], [x21]\n"
        "b 26f\n"
        "14:" // 4 rounds: Partial load: partial_4_16
        "tbz x19, #2, 16f\n"
        "ldr s13, [x21], #0x4\n"
        "tbz x19, #1, 15f\n"
        "ld1 { v13.h }[2], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v13.b }[6], [x21]\n"
        "b 26f\n"
        "15:" // 4 rounds: Partial load: partial_1_20
        "tbz x19, #0, 26f\n"
        "ld1 { v13.b }[4], [x21]\n"
        "b 26f\n"
        "16:" // 4 rounds: Partial load: partial_2_16
        "tbz x19, #1, 17f\n"
        "ldr h13, [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v13.b }[2], [x21]\n"
        "b 26f\n"
        "17:" // 4 rounds: Partial load: partial_1_16
        "tbz x19, #0, 26f\n"
        "ldr b13, [x21, #0x0]\n"
        "b 26f\n"
        "18:" // 4 rounds: Partial load: partial_8_0
        "tbz x19, #3, 22f\n"
        "ldr d9, [x21], #0x8\n"
        "tbz x19, #2, 20f\n"
        "ld1 { v9.s }[2], [x21], #0x4\n"
        "tbz x19, #1, 19f\n"
        "ld1 { v9.h }[6], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v9.b }[14], [x21]\n"
        "b 26f\n"
        "19:" // 4 rounds: Partial load: partial_1_12
        "tbz x19, #0, 26f\n"
        "ld1 { v9.b }[12], [x21]\n"
        "b 26f\n"
        "20:" // 4 rounds: Partial load: partial_2_8
        "tbz x19, #1, 21f\n"
        "ld1 { v9.h }[4], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v9.b }[10], [x21]\n"
        "b 26f\n"
        "21:" // 4 rounds: Partial load: partial_1_8
        "tbz x19, #0, 26f\n"
        "ld1 { v9.b }[8], [x21]\n"
        "b 26f\n"
        "22:" // 4 rounds: Partial load: partial_4_0
        "tbz x19, #2, 24f\n"
        "ldr s9, [x21], #0x4\n"
        "tbz x19, #1, 23f\n"
        "ld1 { v9.h }[2], [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v9.b }[6], [x21]\n"
        "b 26f\n"
        "23:" // 4 rounds: Partial load: partial_1_4
        "tbz x19, #0, 26f\n"
        "ld1 { v9.b }[4], [x21]\n"
        "b 26f\n"
        "24:" // 4 rounds: Partial load: partial_2_0
        "tbz x19, #1, 25f\n"
        "ldr h9, [x21], #0x2\n"
        "tbz x19, #0, 26f\n"
        "ld1 { v9.b }[2], [x21]\n"
        "b 26f\n"
        "25:" // 4 rounds: Partial load: partial_1_0
        "ldr b9, [x21, #0x0]\n"
        "26:" // 4 rounds: Partial load: Done
        "b 28f\n"
        "27:" // 4 rounds: Full load
        "ldr q9, [x21, #0x0]\n"
        "ldr q13, [x21, #0x10]\n"
        "ldr q14, [x21, #0x20]\n"
        "add x21, x21, #0x30\n"
        "28:" // 4 rounds: Load done
        // Translate the three 16-byte chunks (v9, v13, v14): for each chunk,
        // TBL against each 64-byte quarter of the table with a biased index;
        // out-of-range lanes produce 0, so ORing the four results keeps the hit.
        "sub v8.16b, v9.16b, v12.16b\n"
        "sub v7.16b, v9.16b, v11.16b\n"
        "tbl v8.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v8.16b\n"
        "sub v6.16b, v9.16b, v10.16b\n"
        "sub v5.16b, v13.16b, v12.16b\n"
        "tbl v9.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v9.16b\n"
        "sub v4.16b, v13.16b, v11.16b\n"
        "sub v3.16b, v13.16b, v10.16b\n"
        "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n"
        "sub v2.16b, v14.16b, v12.16b\n"
        "sub v1.16b, v14.16b, v11.16b\n"
        "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n"
        "sub v0.16b, v14.16b, v10.16b\n"
        "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n"
        "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n"
        "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n"
        "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n"
        "orr v9.16b, v9.16b, v8.16b\n"
        "tbl v14.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v14.16b\n"
        "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n"
        "orr v7.16b, v7.16b, v6.16b\n"
        "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n"
        "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n"
        "orr v13.16b, v13.16b, v5.16b\n"
        "orr v4.16b, v4.16b, v3.16b\n"
        "orr v14.16b, v14.16b, v2.16b\n"
        "cmp x19, #0x30\n"
        "orr v1.16b, v1.16b, v0.16b\n"
        "orr v9.16b, v9.16b, v7.16b\n"
        "orr v13.16b, v13.16b, v4.16b\n"
        "orr v14.16b, v14.16b, v1.16b\n"
        "bge 53f\n"
        // Partial writeback mirrors the partial-load bit decomposition.
        "tbz x19, #5, 36f\n"
        "st1 { v9.16b }, [x20], #0x10\n"
        "st1 { v13.16b }, [x20], #0x10\n"
        "tbz x19, #3, 32f\n"
        "str d14, [x20], #0x8\n"
        "tbz x19, #2, 30f\n"
        "st1 { v14.s }[2], [x20], #0x4\n"
        "tbz x19, #1, 29f\n"
        "st1 { v14.h }[6], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v14.b }[14], [x20]\n"
        "b 52f\n"
        "29:" // 4 rounds: Partial writeback: partial_1_44
        "tbz x19, #0, 52f\n"
        "st1 { v14.b }[12], [x20]\n"
        "b 52f\n"
        "30:" // 4 rounds: Partial writeback: partial_2_40
        "tbz x19, #1, 31f\n"
        "st1 { v14.h }[4], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v14.b }[10], [x20]\n"
        "b 52f\n"
        "31:" // 4 rounds: Partial writeback: partial_1_40
        "tbz x19, #0, 52f\n"
        "st1 { v14.b }[8], [x20]\n"
        "b 52f\n"
        "32:" // 4 rounds: Partial writeback: partial_4_32
        "tbz x19, #2, 34f\n"
        "str s14, [x20], #0x4\n"
        "tbz x19, #1, 33f\n"
        "st1 { v14.h }[2], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v14.b }[6], [x20]\n"
        "b 52f\n"
        "33:" // 4 rounds: Partial writeback: partial_1_36
        "tbz x19, #0, 52f\n"
        "st1 { v14.b }[4], [x20]\n"
        "b 52f\n"
        "34:" // 4 rounds: Partial writeback: partial_2_32
        "tbz x19, #1, 35f\n"
        "str h14, [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v14.b }[2], [x20]\n"
        "b 52f\n"
        "35:" // 4 rounds: Partial writeback: partial_1_32
        "tbz x19, #0, 52f\n"
        "str b14, [x20, #0x0]\n"
        "b 52f\n"
        "36:" // 4 rounds: Partial writeback: partial_16_0
        "tbz x19, #4, 44f\n"
        "st1 { v9.16b }, [x20], #0x10\n"
        "tbz x19, #3, 40f\n"
        "str d13, [x20], #0x8\n"
        "tbz x19, #2, 38f\n"
        "st1 { v13.s }[2], [x20], #0x4\n"
        "tbz x19, #1, 37f\n"
        "st1 { v13.h }[6], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v13.b }[14], [x20]\n"
        "b 52f\n"
        "37:" // 4 rounds: Partial writeback: partial_1_28
        "tbz x19, #0, 52f\n"
        "st1 { v13.b }[12], [x20]\n"
        "b 52f\n"
        "38:" // 4 rounds: Partial writeback: partial_2_24
        "tbz x19, #1, 39f\n"
        "st1 { v13.h }[4], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v13.b }[10], [x20]\n"
        "b 52f\n"
        "39:" // 4 rounds: Partial writeback: partial_1_24
        "tbz x19, #0, 52f\n"
        "st1 { v13.b }[8], [x20]\n"
        "b 52f\n"
        "40:" // 4 rounds: Partial writeback: partial_4_16
        "tbz x19, #2, 42f\n"
        "str s13, [x20], #0x4\n"
        "tbz x19, #1, 41f\n"
        "st1 { v13.h }[2], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v13.b }[6], [x20]\n"
        "b 52f\n"
        "41:" // 4 rounds: Partial writeback: partial_1_20
        "tbz x19, #0, 52f\n"
        "st1 { v13.b }[4], [x20]\n"
        "b 52f\n"
        "42:" // 4 rounds: Partial writeback: partial_2_16
        "tbz x19, #1, 43f\n"
        "str h13, [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v13.b }[2], [x20]\n"
        "b 52f\n"
        "43:" // 4 rounds: Partial writeback: partial_1_16
        "tbz x19, #0, 52f\n"
        "str b13, [x20, #0x0]\n"
        "b 52f\n"
        "44:" // 4 rounds: Partial writeback: partial_8_0
        "tbz x19, #3, 48f\n"
        "str d9, [x20], #0x8\n"
        "tbz x19, #2, 46f\n"
        "st1 { v9.s }[2], [x20], #0x4\n"
        "tbz x19, #1, 45f\n"
        "st1 { v9.h }[6], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v9.b }[14], [x20]\n"
        "b 52f\n"
        "45:" // 4 rounds: Partial writeback: partial_1_12
        "tbz x19, #0, 52f\n"
        "st1 { v9.b }[12], [x20]\n"
        "b 52f\n"
        "46:" // 4 rounds: Partial writeback: partial_2_8
        "tbz x19, #1, 47f\n"
        "st1 { v9.h }[4], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v9.b }[10], [x20]\n"
        "b 52f\n"
        "47:" // 4 rounds: Partial writeback: partial_1_8
        "tbz x19, #0, 52f\n"
        "st1 { v9.b }[8], [x20]\n"
        "b 52f\n"
        "48:" // 4 rounds: Partial writeback: partial_4_0
        "tbz x19, #2, 50f\n"
        "str s9, [x20], #0x4\n"
        "tbz x19, #1, 49f\n"
        "st1 { v9.h }[2], [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v9.b }[6], [x20]\n"
        "b 52f\n"
        "49:" // 4 rounds: Partial writeback: partial_1_4
        "tbz x19, #0, 52f\n"
        "st1 { v9.b }[4], [x20]\n"
        "b 52f\n"
        "50:" // 4 rounds: Partial writeback: partial_2_0
        "tbz x19, #1, 51f\n"
        "str h9, [x20], #0x2\n"
        "tbz x19, #0, 52f\n"
        "st1 { v9.b }[2], [x20]\n"
        "b 52f\n"
        "51:" // 4 rounds: Partial writeback: partial_1_0
        "str b9, [x20, #0x0]\n"
        "52:" // 4 rounds: Partial writeback: Done
        "b 54f\n"
        "53:" // 4 rounds: Full writeback
        "str q9, [x20, #0x0]\n"
        "str q13, [x20, #0x10]\n"
        "str q14, [x20, #0x20]\n"
        "add x20, x20, #0x30\n"
        "54:" // 4 rounds: Writeback done
        // Loop control: consume 48 bytes; when the string is done, advance to the next one.
        "subs x19, x19, #0x30\n"
        "bgt 2b\n"
        "add x22, x22, #0x1\n"
        "cmp x22, %x[num_strings]\n"
        "bne 1b\n"
        :
        : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length), [table] "r"(table)
        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22");
}
416
417#endif // __aarch64__
418} // namespace
419
Viet-Hoa Dob042e392022-06-21 15:56:15 +0100420void neon_qasymm8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +0100421{
Viet-Hoa Dob042e392022-06-21 15:56:15 +0100422 ARM_COMPUTE_ERROR_ON(!ActivationLayerInfo::is_lut_supported(act_info.activation(), src->info()->data_type()));
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +0100423#ifdef __aarch64__
Pablo Marquez Tello41eb2d92022-06-23 16:02:05 +0100424 const int window_step_x = src->info()->tensor_shape().x();
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +0100425 Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
426 win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +0100427 Iterator input(src, win_collapsed);
428 Iterator output(dst, win_collapsed);
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +0100429 execute_window_loop(win_collapsed, [&](const Coordinates &)
430 {
Pablo Marquez Tello41eb2d92022-06-23 16:02:05 +0100431 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
432 auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
433 substitute_bytes_neon(act_info.lut().data(), 1u, window_step_x, &input_ptr, &output_ptr);
Pablo Marquez Tellod75cd8a2022-05-26 14:19:39 +0100434 },
435 input, output);
436#else // #ifdef __aarch64__
437 ARM_COMPUTE_UNUSED(src);
438 ARM_COMPUTE_UNUSED(dst);
439 ARM_COMPUTE_UNUSED(act_info);
440 ARM_COMPUTE_UNUSED(window);
441 ARM_COMPUTE_ERROR("LUT Only supported in aarch64.");
442#endif // __aarch64__
443}
444
/** NEON intrinsic QASYMM8 (uint8) activation kernel.
 *
 * Processes 16 bytes per vector iteration plus a scalar tail. RELU-family
 * activations are computed directly in the quantized domain and re-quantized
 * with a fused scale/offset (s, o); LOGISTIC/TANH (and, on non-aarch64,
 * HARD_SWISH/LEAKY_RELU) de-quantize to f32, apply the function, then
 * re-quantize. On aarch64, HARD_SWISH and LEAKY_RELU are compiled out here
 * because the LUT kernel above handles them.
 *
 * @param src      Source tensor (uint8, asymmetric quantization).
 * @param dst      Destination tensor (uint8); quantization info may differ from @p src.
 * @param act_info Activation function and its a/b parameters.
 * @param window   Execution window.
 */
void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
    // 16 uint8 lanes per vector iteration; X is iterated manually below.
    constexpr int window_step_x  = 16;
    const auto    window_start_x = static_cast<int>(window.x().start());
    const auto    window_end_x   = static_cast<int>(window.x().end());
    const ActivationLayerInfo::ActivationFunction act = act_info.activation();

    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(src, win_collapsed);
    Iterator output(dst, win_collapsed);

    // Quantization parameters of both tensors; a/b clamp bounds are quantized
    // into the *input* space so RELU variants can work on raw uint8 values.
    const UniformQuantizationInfo qi_in    = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo qi_out   = dst->info()->quantization_info().uniform();
    const qasymm8x16_t            va       = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
    const qasymm8x16_t            vb       = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
    const qasymm8_t               a        = quantize_qasymm8(act_info.a(), qi_in);
    const qasymm8_t               b        = quantize_qasymm8(act_info.b(), qi_in);
    const qasymm8_t               const_0  = quantize_qasymm8(0.f, qi_in);  // quantized zero point of the input
    const qasymm8x16_t            vconst_0 = vdupq_n_u8(const_0);
    const auto                    vconst_1 = vdupq_n_f32(1.f);
#ifndef __aarch64__
    const auto vconst_0_f32 = vdupq_n_f32(0);  // only needed by LEAKY_RELU's sign mask
#endif // __aarch64__
    // Float copies of a/b for the de-quantized (f32) activation paths.
    const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
    const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
    const float       a_f32  = act_info.a();
    const float       b_f32  = act_info.b();

#ifndef __aarch64__
    // HARD_SWISH constants: x * relu6(x + 3) / 6.
    const auto const_6_f32     = vdupq_n_f32(6.f);
    const auto const_0_f32     = vdupq_n_f32(0.f);
    const auto const_3_f32     = vdupq_n_f32(3.f);
    const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
#endif // __aarch64__

    // Initialise scale/offset for re-quantization
    float       s  = qi_in.scale / qi_out.scale;
    float       o  = -qi_in.offset * s + qi_out.offset;
    float32x4_t vs = vdupq_n_f32(s);
    float32x4_t vo = vdupq_n_f32(o);

    execute_window_loop(win_collapsed, [&](const Coordinates &)
    {
        const auto input_ptr  = reinterpret_cast<const qasymm8_t *>(input.ptr());
        const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());

        wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;

        // Compute S elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const auto vin = wrapper::vloadq(input_ptr + x);
            if(act == ActivationLayerInfo::ActivationFunction::RELU)
            {
                // Perform activation: max(0, x) in the quantized input domain.
                tmp = vmaxq_u8(vconst_0, vin);
                // Re-quantize to new output space
                tmp = vmlaq_qasymm8(tmp, vs, vo);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
            {
                // Perform activation: min(a, max(0, x)).
                tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
                // Re-quantize to new output space
                tmp = vmlaq_qasymm8(tmp, vs, vo);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
            {
                // Perform activation: min(a, max(b, x)).
                tmp = vminq_u8(va, vmaxq_u8(vb, vin));
                // Re-quantize to new output space
                tmp = vmlaq_qasymm8(tmp, vs, vo);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
            {
                // De-quantize
                const auto vin_deq = vdequantize(vin, qi_in);
                // Perform activation: 1 / (1 + exp(-x)) on each of the four f32 quarters.
                const float32x4x4_t tmp_dep =
                {
                    {
                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
                    }
                };
                // Re-quantize to new output space
                tmp = vquantize(tmp_dep, qi_out);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
            {
                // De-quantize
                const auto vin_deq = vdequantize(vin, qi_in);
                // Perform activation: a * tanh(b * x).
                const float32x4x4_t tmp_dep =
                {
                    {
                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
                    }
                };
                // Re-quantize to new output space
                tmp = vquantize(tmp_dep, qi_out);
            }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
            {
                // De-quantize
                const auto vin_deq = vdequantize(vin, qi_in);
                // Perform activation: x * relu6(x + 3) / 6.
                const float32x4x4_t tmp_dep =
                {
                    {
                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
                    }
                };
                // Re-quantize to new output space
                tmp = vquantize(tmp_dep, qi_out);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
            {
                const auto vin_deq = vdequantize(vin, qi_in);

                // Per-lane mask: true where the de-quantized value is positive.
                const uint32x4x4_t pos_mask =
                {
                    {
                        wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
                        wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
                        wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
                        wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
                    }
                };

                // Select x where positive, a * x otherwise.
                const float32x4x4_t tmp_dep =
                {
                    {
                        wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
                        wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
                        wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
                        wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
                    }
                };

                tmp = vquantize(tmp_dep, qi_out);
            }
#endif // __aarch64__
            else
            {
                ARM_COMPUTE_ERROR("Unsupported activation function");
            }
            wrapper::vstore(output_ptr + x, tmp);
        }

        // Compute left-over elements — scalar mirror of the vector branches above.
        for(; x < window_end_x; ++x)
        {
            qasymm8_t in  = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
            qasymm8_t tmp = 0;
            if(act == ActivationLayerInfo::ActivationFunction::RELU)
            {
                tmp = std::max(const_0, in);
                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
            {
                tmp = std::min(a, std::max(const_0, in));
                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
            {
                tmp = std::min(a, std::max(b, in));
                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
            {
                float tmp_f = dequantize_qasymm8(in, qi_in);
                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
                tmp         = quantize_qasymm8(tmp_f, qi_out);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
            {
                float tmp_f = dequantize_qasymm8(in, qi_in);
                tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
                tmp         = quantize_qasymm8(tmp_f, qi_out);
            }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
            {
                float tmp_f = dequantize_qasymm8(in, qi_in);
                tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
                tmp         = quantize_qasymm8(tmp_f, qi_out);
            }
            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
            {
                float tmp_f = dequantize_qasymm8(in, qi_in);
                tmp_f       = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
                tmp         = quantize_qasymm8(tmp_f, qi_out);
            }
#endif // __aarch64__
            else
            {
                ARM_COMPUTE_ERROR("Unsupported activation function");
            }
            *(output_ptr + x) = tmp;
        }
    },
    input, output);
}
662} // namespace cpu
663} // namespace arm_compute