/*
 * Copyright (c) 2019-2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifdef __aarch64__

/**
 * Interleave up to four rows of int8 data in 16-byte blocks and append the
 * per-row sums (specialisation: 4 rows, block of 16, no vector-length
 * scaling, row sums enabled).
 *
 * For every 16-byte column block the routine writes 64 bytes to the output:
 * 16 bytes from row 0, then row 1, row 2 and row 3. After the last block it
 * writes four 32-bit values holding the sum of all bytes read from each row.
 *
 * @param out_ptr     Output cursor, taken by reference and advanced past
 *                    everything written. When @p first is false it is first
 *                    moved back by 16 bytes so that the previously written
 *                    sums are read, added to the new sums and overwritten.
 * @param in          Array of row pointers; entries 0..3 are always loaded.
 *                    NOTE(review): this means the array itself must hold four
 *                    readable entries even when height < 4 - confirm callers.
 * @param width       Number of bytes to consume from each row.
 * @param height      Number of valid rows. When it is not 4, row 3 (and row 2
 *                    unless height > 2, and row 1 unless height >= 2) read
 *                    from row 0's data instead of their own pointer.
 * @param row_offset  Byte offset added to every row pointer before reading.
 * @param first       True when no earlier sums exist in the output; false to
 *                    accumulate onto the 16 bytes just before @p out_ptr.
 */
template<>
void interleave_block<4, 16, VLType::None, true>(
  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height,
  size_t row_offset, bool first
)
{
  // Register roles inside the asm block:
  //   x24, x23, x21, x20 : read pointers for rows 0, 1, 2, 3
  //   x22                : main-loop iterations since the 16-bit sums were last folded
  //   v19, v18, v17, v16 : current 16-byte chunk of rows 0, 1, 2, 3
  //   v28, v27, v26, v25 : 16-bit pairwise running sums for rows 0, 1, 2, 3
  //   v24, v23, v22, v21 : 32-bit running sums for rows 0, 1, 2, 3
  //   v20                : sums carried over from a previous call (zero when first)
  __asm__ __volatile__(
      // Load the four row pointers, apply row_offset and clear all accumulators.
      "ldr x24, [%x[in], #0x0]\n"
      "ldr x23, [%x[in], #0x8]\n"
      "cmp %x[height], #0x4\n"
      "mov x22, #0x0\n"
      "ldr x21, [%x[in], #0x10]\n"
      "ldr x20, [%x[in], #0x18]\n"
      "movi v28.8h, #0x0\n"
      "movi v27.8h, #0x0\n"
      "movi v26.8h, #0x0\n"
      "movi v25.8h, #0x0\n"
      "add x24, x24, %x[row_offset]\n"
      "add x23, x23, %x[row_offset]\n"
      "movi v24.4s, #0x0\n"
      "movi v23.4s, #0x0\n"
      "add x21, x21, %x[row_offset]\n"
      "add x20, x20, %x[row_offset]\n"
      "movi v22.4s, #0x0\n"
      "movi v21.4s, #0x0\n"
      // height == 4: use all four pointers as loaded.
      "beq 1f\n"
      // Otherwise redirect the rows that are not present to row 0's pointer.
      "cmp %x[height], #0x2\n"
      "mov x20, x24\n"
      "csel x23, x23, x24, GE\n"
      "csel x21, x21, x24, GT\n"
      "1:"  // no_pointer_adj
      "prfm pldl1keep, [x24, #0x0]\n"
      "prfm pldl1keep, [x23, #0x0]\n"
      "movi v20.4s, #0x0\n"
      "prfm pldl1keep, [x21, #0x0]\n"
      "prfm pldl1keep, [x20, #0x0]\n"
      "prfm pldl1keep, [x24, #0x40]\n"
      "prfm pldl1keep, [x23, #0x40]\n"
      "prfm pldl1keep, [x21, #0x40]\n"
      "prfm pldl1keep, [x20, #0x40]\n"
      // Not the first pass: step back over the previously stored sums and
      // load them into v20 so they can be added to this call's sums.
      "cbnz %w[first], 2f\n"
      "sub %x[out_ptr], %x[out_ptr], #0x10\n"
      "ld1 { v20.4s }, [%x[out_ptr]]\n"
      "2:"  // first_pass
      "cmp %x[width], #0x10\n"
      "blt 5f\n"
      "3:"  // Main loop head
      // Once more than 0x7e iterations have been accumulated, fold the 16-bit
      // sums into the 32-bit sums and restart the count (presumably to keep
      // the 16-bit lanes from overflowing).
      "cmp x22, #0x7e\n"
      "ble 4f\n"
      "sadalp v24.4s, v28.8h\n"
      "movi v28.8h, #0x0\n"
      "mov x22, #0x0\n"
      "sadalp v23.4s, v27.8h\n"
      "movi v27.8h, #0x0\n"
      "sadalp v22.4s, v26.8h\n"
      "movi v26.8h, #0x0\n"
      "sadalp v21.4s, v25.8h\n"
      "movi v25.8h, #0x0\n"
      "4:"  // no_accumulate_16
      // Copy one full 16-byte block per row and accumulate its bytes.
      "ldr q19, [x24], #0x10\n"
      "ldr q18, [x23], #0x10\n"
      // The flags written by subs are replaced by the cmp that follows; the
      // "bge 3b" at the end of the loop tests (remaining width >= 16).
      "subs %x[width], %x[width], #0x10\n"
      "cmp %x[width], #0x10\n"
      "ldr q17, [x21], #0x10\n"
      "ldr q16, [x20], #0x10\n"
      "str q19, [%x[out_ptr], #0x0]\n"
      "sadalp v28.8h, v19.16b\n"
      "prfm pldl1keep, [x24, #0x70]\n"
      "prfm pldl1keep, [x23, #0x70]\n"
      "str q18, [%x[out_ptr], #0x10]\n"
      "sadalp v27.8h, v18.16b\n"
      "prfm pldl1keep, [x21, #0x70]\n"
      "prfm pldl1keep, [x20, #0x70]\n"
      "str q17, [%x[out_ptr], #0x20]\n"
      "sadalp v26.8h, v17.16b\n"
      "str q16, [%x[out_ptr], #0x30]\n"
      "sadalp v25.8h, v16.16b\n"
      "add x22, x22, #0x1\n"
      "add %x[out_ptr], %x[out_ptr], #0x40\n"
      "bge 3b\n"
      "5:"  // Main loop skip
      // Tail: 0..15 bytes remain. Nothing left -> go straight to the sums.
      "cbz %x[width], 14f\n"
      // Decode the remaining width bit by bit (8, 4, 2, 1) so that exactly
      // `width` bytes are read from each row. The first load on each path is
      // a scalar ldr, which clears the rest of the vector register, so the
      // unread bytes of the block are stored as zero.
      "tbz %x[width], #3, 9f\n"
      "ldr d19, [x24], #0x8\n"
      "ldr d18, [x23], #0x8\n"
      "ldr d17, [x21], #0x8\n"
      "ldr d16, [x20], #0x8\n"
      "tbz %x[width], #2, 7f\n"
      "ld1 { v19.s }[2], [x24], #0x4\n"
      "ld1 { v18.s }[2], [x23], #0x4\n"
      "ld1 { v17.s }[2], [x21], #0x4\n"
      "ld1 { v16.s }[2], [x20], #0x4\n"
      "tbz %x[width], #1, 6f\n"
      "ld1 { v19.h }[6], [x24], #0x2\n"
      "ld1 { v18.h }[6], [x23], #0x2\n"
      "ld1 { v17.h }[6], [x21], #0x2\n"
      "ld1 { v16.h }[6], [x20], #0x2\n"
      "tbz %x[width], #0, 13f\n"
      "ld1 { v19.b }[14], [x24]\n"
      "ld1 { v18.b }[14], [x23]\n"
      "ld1 { v17.b }[14], [x21]\n"
      "ld1 { v16.b }[14], [x20]\n"
      "b 13f\n"
      "6:"  // odd_loads_1_12
      "tbz %x[width], #0, 13f\n"
      "ld1 { v19.b }[12], [x24]\n"
      "ld1 { v18.b }[12], [x23]\n"
      "ld1 { v17.b }[12], [x21]\n"
      "ld1 { v16.b }[12], [x20]\n"
      "b 13f\n"
      "7:"  // odd_loads_2_8
      "tbz %x[width], #1, 8f\n"
      "ld1 { v19.h }[4], [x24], #0x2\n"
      "ld1 { v18.h }[4], [x23], #0x2\n"
      "ld1 { v17.h }[4], [x21], #0x2\n"
      "ld1 { v16.h }[4], [x20], #0x2\n"
      "tbz %x[width], #0, 13f\n"
      "ld1 { v19.b }[10], [x24]\n"
      "ld1 { v18.b }[10], [x23]\n"
      "ld1 { v17.b }[10], [x21]\n"
      "ld1 { v16.b }[10], [x20]\n"
      "b 13f\n"
      "8:"  // odd_loads_1_8
      "tbz %x[width], #0, 13f\n"
      "ld1 { v19.b }[8], [x24]\n"
      "ld1 { v18.b }[8], [x23]\n"
      "ld1 { v17.b }[8], [x21]\n"
      "ld1 { v16.b }[8], [x20]\n"
      "b 13f\n"
      "9:"  // odd_loads_4_0
      "tbz %x[width], #2, 11f\n"
      "ldr s19, [x24], #0x4\n"
      "ldr s18, [x23], #0x4\n"
      "ldr s17, [x21], #0x4\n"
      "ldr s16, [x20], #0x4\n"
      "tbz %x[width], #1, 10f\n"
      "ld1 { v19.h }[2], [x24], #0x2\n"
      "ld1 { v18.h }[2], [x23], #0x2\n"
      "ld1 { v17.h }[2], [x21], #0x2\n"
      "ld1 { v16.h }[2], [x20], #0x2\n"
      "tbz %x[width], #0, 13f\n"
      "ld1 { v19.b }[6], [x24]\n"
      "ld1 { v18.b }[6], [x23]\n"
      "ld1 { v17.b }[6], [x21]\n"
      "ld1 { v16.b }[6], [x20]\n"
      "b 13f\n"
      "10:"  // odd_loads_1_4
      "tbz %x[width], #0, 13f\n"
      "ld1 { v19.b }[4], [x24]\n"
      "ld1 { v18.b }[4], [x23]\n"
      "ld1 { v17.b }[4], [x21]\n"
      "ld1 { v16.b }[4], [x20]\n"
      "b 13f\n"
      "11:"  // odd_loads_2_0
      "tbz %x[width], #1, 12f\n"
      "ldr h19, [x24], #0x2\n"
      "ldr h18, [x23], #0x2\n"
      "ldr h17, [x21], #0x2\n"
      "ldr h16, [x20], #0x2\n"
      "tbz %x[width], #0, 13f\n"
      "ld1 { v19.b }[2], [x24]\n"
      "ld1 { v18.b }[2], [x23]\n"
      "ld1 { v17.b }[2], [x21]\n"
      "ld1 { v16.b }[2], [x20]\n"
      "b 13f\n"
      "12:"  // odd_loads_1_0
      // Bits 3, 2 and 1 are clear and width is non-zero, so exactly one byte remains.
      "ldr b19, [x24, #0x0]\n"
      "ldr b18, [x23, #0x0]\n"
      "ldr b17, [x21, #0x0]\n"
      "ldr b16, [x20, #0x0]\n"
      "13:"  // Odd load end
      // Store the partial block as a full 64-byte group and accumulate it.
      "str q19, [%x[out_ptr], #0x0]\n"
      "sadalp v28.8h, v19.16b\n"
      "sadalp v27.8h, v18.16b\n"
      "str q18, [%x[out_ptr], #0x10]\n"
      "sadalp v26.8h, v17.16b\n"
      "sadalp v25.8h, v16.16b\n"
      "str q17, [%x[out_ptr], #0x20]\n"
      "str q16, [%x[out_ptr], #0x30]\n"
      "add %x[out_ptr], %x[out_ptr], #0x40\n"
      "14:"  // Odds skip
      // Fold the outstanding 16-bit sums into the 32-bit sums, reduce each
      // row's four lanes to a single value (v24 = {row0, row1, row2, row3}),
      // add the carried-over sums and append the result to the output.
      "sadalp v24.4s, v28.8h\n"
      "sadalp v23.4s, v27.8h\n"
      "sadalp v22.4s, v26.8h\n"
      "sadalp v21.4s, v25.8h\n"
      "addp v24.4s, v24.4s, v23.4s\n"
      "addp v23.4s, v22.4s, v21.4s\n"
      "addp v24.4s, v24.4s, v23.4s\n"
      "add v24.4s, v24.4s, v20.4s\n"
      "str q24, [%x[out_ptr], #0x0]\n"
      "add %x[out_ptr], %x[out_ptr], #0x10\n"
      : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24"
    );
}


#endif // __aarch64__