blob: 34d43f5052118a1e0bd7475938359b05e2edcc8c [file] [log] [blame]
/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24
25#pragma once
26
27#ifdef __ARM_FEATURE_SVE
28
29
30namespace {
31
32void sve_transpose_interleave_8VL_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
33{
34 uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
35
36 if (height % 8) {
37 memset(pad_row, 0, width * sizeof(uint8_t));
38 }
39
40 size_t out_stride = 8 * roundup<size_t>(height, 8) * get_vector_length<uint64_t>();
41
42 __asm__ __volatile__(
43 "ptrue p1.b\n"
44 "1:" // Main row loop: Head
Michael Tylerbe13cea2023-01-17 11:04:14 +000045 "mov x9, %x[in]\n"
46 "mov x28, %x[out]\n"
47 "add x27, x9, %x[in_stride]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010048 "add x26, x27, %x[in_stride]\n"
49 "add x25, x26, %x[in_stride]\n"
50 "add x24, x25, %x[in_stride]\n"
51 "add x23, x24, %x[in_stride]\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000052 "add x22, x23, %x[in_stride]\n"
53 "add x21, x22, %x[in_stride]\n"
54 "add %x[in], x21, %x[in_stride]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010055 "cmp %x[height], #0x7\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000056 "csel x21, x21, %x[pad_row], GT\n"
57 "csel x22, x22, %x[pad_row], GE\n"
58 "cmp %x[height], #0x5\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010059 "csel x23, x23, %x[pad_row], GT\n"
60 "csel x24, x24, %x[pad_row], GE\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000061 "cmp %x[height], #0x3\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010062 "csel x25, x25, %x[pad_row], GT\n"
63 "csel x26, x26, %x[pad_row], GE\n"
Michael Tylerba209752022-12-15 12:39:29 +000064 "cmp %x[height], #0x1\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000065 "csel x27, x27, %x[pad_row], GT\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010066 "sub %x[height], %x[height], #0x8\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000067 "mov x20, %x[width]\n"
68 "cntb x19, ALL, MUL #2\n"
69 "cmp x20, x19\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010070 "blt 3f\n"
71 "2:" // Main row loop: Unroll column loop
Michael Tylerbe13cea2023-01-17 11:04:14 +000072 "ld1b { z17.b }, p1/Z, [x9]\n"
73 "sub x20, x20, x19\n"
74 "ld1b { z5.b }, p1/Z, [x9, #1, MUL VL]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010075 "addvl x9, x9, #2\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000076 "ld1b { z19.b }, p1/Z, [x27]\n"
77 "cmp x20, x19\n"
78 "ld1b { z4.b }, p1/Z, [x27, #1, MUL VL]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010079 "addvl x27, x27, #2\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000080 "ld1b { z18.b }, p1/Z, [x26]\n"
81 "ld1b { z3.b }, p1/Z, [x26, #1, MUL VL]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010082 "addvl x26, x26, #2\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000083 "ld1b { z2.b }, p1/Z, [x25]\n"
84 "ld1b { z1.b }, p1/Z, [x25, #1, MUL VL]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010085 "addvl x25, x25, #2\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000086 "ld1b { z16.b }, p1/Z, [x24]\n"
87 "zip1 z0.b, z17.b, z16.b\n"
88 "ld1b { z31.b }, p1/Z, [x24, #1, MUL VL]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010089 "addvl x24, x24, #2\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000090 "zip2 z30.b, z17.b, z16.b\n"
91 "ld1b { z17.b }, p1/Z, [x23]\n"
92 "ld1b { z29.b }, p1/Z, [x23, #1, MUL VL]\n"
93 "zip1 z28.b, z5.b, z31.b\n"
94 "ld1b { z16.b }, p1/Z, [x22]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010095 "addvl x23, x23, #2\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000096 "zip1 z27.b, z19.b, z17.b\n"
97 "ld1b { z26.b }, p1/Z, [x22, #1, MUL VL]\n"
98 "addvl x22, x22, #2\n"
99 "zip2 z25.b, z19.b, z17.b\n"
100 "ld1b { z24.b }, p1/Z, [x21]\n"
101 "zip1 z22.b, z4.b, z29.b\n"
102 "ld1b { z23.b }, p1/Z, [x21, #1, MUL VL]\n"
103 "addvl x21, x21, #2\n"
104 "zip1 z21.b, z18.b, z16.b\n"
105 "zip2 z20.b, z18.b, z16.b\n"
106 "zip1 z18.b, z0.b, z21.b\n"
107 "zip1 z19.b, z2.b, z24.b\n"
108 "zip1 z17.b, z27.b, z19.b\n"
109 "zip1 z16.b, z18.b, z17.b\n"
110 "st1b { z16.b }, p1, [x28]\n"
111 "zip2 z16.b, z18.b, z17.b\n"
112 "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
113 "zip2 z18.b, z0.b, z21.b\n"
114 "zip2 z17.b, z27.b, z19.b\n"
115 "zip1 z16.b, z18.b, z17.b\n"
116 "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
117 "zip2 z16.b, z18.b, z17.b\n"
118 "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
119 "zip1 z18.b, z30.b, z20.b\n"
120 "zip2 z19.b, z2.b, z24.b\n"
121 "zip1 z17.b, z25.b, z19.b\n"
122 "zip1 z16.b, z18.b, z17.b\n"
123 "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
124 "zip2 z16.b, z18.b, z17.b\n"
125 "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
126 "zip2 z18.b, z30.b, z20.b\n"
127 "zip2 z17.b, z25.b, z19.b\n"
128 "zip1 z16.b, z18.b, z17.b\n"
129 "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
130 "zip2 z16.b, z18.b, z17.b\n"
131 "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
132 "add x28, x28, %x[out_stride]\n"
133 "zip1 z20.b, z3.b, z26.b\n"
134 "zip1 z19.b, z1.b, z23.b\n"
135 "zip1 z18.b, z28.b, z20.b\n"
136 "zip1 z17.b, z22.b, z19.b\n"
137 "zip1 z16.b, z18.b, z17.b\n"
138 "st1b { z16.b }, p1, [x28]\n"
139 "zip2 z16.b, z18.b, z17.b\n"
140 "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
141 "zip2 z18.b, z28.b, z20.b\n"
142 "zip2 z17.b, z22.b, z19.b\n"
143 "zip1 z16.b, z18.b, z17.b\n"
144 "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
145 "zip2 z16.b, z18.b, z17.b\n"
146 "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
147 "zip2 z22.b, z5.b, z31.b\n"
148 "zip2 z21.b, z3.b, z26.b\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100149 "zip1 z18.b, z22.b, z21.b\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000150 "zip2 z20.b, z4.b, z29.b\n"
151 "zip2 z19.b, z1.b, z23.b\n"
152 "zip1 z17.b, z20.b, z19.b\n"
153 "zip1 z16.b, z18.b, z17.b\n"
154 "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
155 "zip2 z16.b, z18.b, z17.b\n"
156 "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
157 "zip2 z18.b, z22.b, z21.b\n"
158 "zip2 z17.b, z20.b, z19.b\n"
159 "zip1 z16.b, z18.b, z17.b\n"
160 "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
161 "zip2 z16.b, z18.b, z17.b\n"
162 "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
163 "add x28, x28, %x[out_stride]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100164 "bge 2b\n"
165 "3:" // Main row loop: Unroll column loop skip
Michael Tylerbe13cea2023-01-17 11:04:14 +0000166 "cbz x20, 5f\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100167 "4:" // Main row loop: Column loop
Michael Tylerbe13cea2023-01-17 11:04:14 +0000168 "whilelt p0.b, XZR, x20\n"
169 "ld1b { z18.b }, p0/Z, [x9]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100170 "addvl x9, x9, #1\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000171 "ld1b { z28.b }, p0/Z, [x27]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100172 "addvl x27, x27, #1\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000173 "ld1b { z17.b }, p0/Z, [x26]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100174 "addvl x26, x26, #1\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000175 "ld1b { z27.b }, p0/Z, [x25]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100176 "addvl x25, x25, #1\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000177 "ld1b { z16.b }, p0/Z, [x24]\n"
178 "zip1 z26.b, z18.b, z16.b\n"
179 "ld1b { z25.b }, p0/Z, [x23]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100180 "addvl x24, x24, #1\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000181 "zip2 z24.b, z18.b, z16.b\n"
182 "ld1b { z16.b }, p0/Z, [x22]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100183 "addvl x23, x23, #1\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000184 "zip1 z23.b, z28.b, z25.b\n"
185 "ld1b { z22.b }, p0/Z, [x21]\n"
186 "addvl x22, x22, #1\n"
187 "zip1 z20.b, z17.b, z16.b\n"
188 "addvl x21, x21, #1\n"
189 "zip2 z21.b, z17.b, z16.b\n"
190 "decd x20, ALL, MUL #8\n"
191 "zip1 z18.b, z26.b, z20.b\n"
192 "cmp x20, #0x0\n"
193 "zip1 z19.b, z27.b, z22.b\n"
194 "zip1 z17.b, z23.b, z19.b\n"
195 "zip1 z16.b, z18.b, z17.b\n"
196 "st1b { z16.b }, p1, [x28]\n"
197 "zip2 z16.b, z18.b, z17.b\n"
198 "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
199 "zip2 z18.b, z26.b, z20.b\n"
200 "zip2 z17.b, z23.b, z19.b\n"
201 "zip1 z16.b, z18.b, z17.b\n"
202 "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
203 "zip2 z16.b, z18.b, z17.b\n"
204 "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
205 "zip1 z18.b, z24.b, z21.b\n"
206 "zip2 z20.b, z28.b, z25.b\n"
207 "zip2 z19.b, z27.b, z22.b\n"
208 "zip1 z17.b, z20.b, z19.b\n"
209 "zip1 z16.b, z18.b, z17.b\n"
210 "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
211 "zip2 z16.b, z18.b, z17.b\n"
212 "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
213 "zip2 z18.b, z24.b, z21.b\n"
214 "zip2 z17.b, z20.b, z19.b\n"
215 "zip1 z16.b, z18.b, z17.b\n"
216 "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
217 "zip2 z16.b, z18.b, z17.b\n"
218 "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
219 "add x28, x28, %x[out_stride]\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100220 "bgt 4b\n"
221 "5:" // Main row loop: Column loop skip
Michael Tylerba209752022-12-15 12:39:29 +0000222 "addvl %x[out], %x[out], #8\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000223 "cmp %x[height], #0x1\n"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100224 "bge 1b\n"
225 : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
226 : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
Michael Tylerbe13cea2023-01-17 11:04:14 +0000227 : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100228 );
229}
230
231} // anonymous namespace
232
233template<>
234void Transform<8, 8, true, VLType::SVE>(
235 uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
236{
237 sve_transpose_interleave_8VL_1x8(
238 reinterpret_cast<uint8_t *>(out),
239 reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
240 (xmax-x0) * sizeof(uint8_t) / 1,
241 stride * sizeof(uint8_t),
242 (kmax-k0)
243 );
244}
245
246template<>
247void Transform<8, 8, true, VLType::SVE>(
248 int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
249{
250 sve_transpose_interleave_8VL_1x8(
251 reinterpret_cast<uint8_t *>(out),
252 reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
253 (xmax-x0) * sizeof(int8_t) / 1,
254 stride * sizeof(int8_t),
255 (kmax-k0)
256 );
257}
258
259#endif