blob: fd6a253c6a5b27868c1ec3451b898de2fd21c540 [file] [log] [blame]
Pablo Telloeb82fd22018-02-23 13:43:50 +00001/*
2 * Copyright (c) 2017-2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
26#ifdef __aarch64__
27
28#include "transpose_interleave_common.hpp"
29
30// Generic unblocked transposed 6x32-bit sized specialisation
31template <>
32template <typename T>
33inline void TransformImpl<6, 1, true, 4, 4>::Transform(
34 T *out, const T *const in, const int stride,
35 const int x0, const int xmax, const int k0, const int kmax)
36{
37 // Redirect to a 12 x uint16_t specialisation
38 TransformImpl<12, 1, true, 2, 2>::Transform(
39 reinterpret_cast<uint16_t *>(out),
40 reinterpret_cast<const uint16_t *const>(in),
41 stride * 2, x0 * 2, xmax * 2, k0, kmax);
42}
43
44// Generic 12x16-bit sized specialisation
45template <>
46template <typename T>
47inline void TransformImpl<12, 1, true, 2, 2>::Transform(
48 T *out, const T *const in, const int stride,
49 const int x0, const int xmax, const int k0, const int kmax)
50{
51 // Redirect to a uint16_t specialisation
52 Transform(
53 reinterpret_cast<uint16_t *>(out),
54 reinterpret_cast<const uint16_t *const>(in),
55 stride, x0, xmax, k0, kmax);
56}
57
58// Specialised 12 x uint16_t version
// Copy one row of 12 uint16_t values (24 bytes) from in0 to out.
//
// in0 is taken by reference and is advanced by 24 bytes (0x18) inside the asm,
// so the caller's pointer is updated. out is taken by value, so the caller's
// copy is not changed.
// ASM_PREFETCH is defined outside this file; presumably it emits a prefetch
// for the given address (here 192 bytes past the already-advanced in0) -
// confirm against its definition.
59template <>
60inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
61{
62 __asm volatile(
    // First 16 bytes (8 values) are moved through q0.
63 "LDR q0, [%[in0]]\n"
64 "STR q0, [%[out]]\n"
    // Remaining 8 bytes (4 values) are moved through d1.
65 "LDR d1, [%[in0], #0x10]\n"
66 "STR d1, [%[out], #0x10]\n"
    // Step the input pointer past the 24 bytes just consumed.
67 "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
68 : [in0] "+r"(in0),
69 [out] "+r"(out)
70 :
    // v0 / v1 are the vector registers behind the q0 / d1 temporaries above.
71 : "v0", "v1", "memory");
72}
73
// Copy one 12-value row (24 bytes) from each of in0 and in1 into out, back to
// back: in0's row first, then in1's row (48 bytes in total).
//
// Both input pointers are references and are each advanced by 24 bytes (0x18).
// The 48 output bytes are assembled in three q registers so they can be
// written with 16-byte stores:
//   q0 = in0 bytes  0..15
//   q1 = in0 bytes 16..23 (low 64 bits) + in1 bytes 0..7 (high 64 bits)
//   q2 = in1 bytes  8..23
// NOTE(review): the q1 packing yields the layout above on a little-endian
// target - confirm if big-endian builds matter.
// ASM_PREFETCH is defined outside this file; presumably it emits a prefetch
// for the given address - confirm against its definition.
74template <>
75inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
76{
77 __asm volatile(
    // Row 0: 16 bytes into q0, the trailing 8 bytes into the low half of v1.
78 "LDR q0, [%[in0]]\n"
79 "LDR d1, [%[in0], #0x10]\n"
80 "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
81
    // Row 1: the leading 8 bytes go via x21 into the high half of v1, the
    // other 16 bytes into q2.
82 "LDR x21, [%[in1]]\n"
83 "LDR q2, [%[in1], #0x08]\n"
84 "INS v1.d[1], x21\n"
85 "ADD %x[in1], %x[in1], #0x18\n"
    // Write all 48 bytes: q0 and q1 at offset 0, q2 at offset 0x20.
86 "STP q0, q1, [%[out]]\n"
87 "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]")
88 : [in0] "+r"(in0),
89 [in1] "+r"(in1),
90 [out] "+r"(out)
91 :
    // x21 is named explicitly in the asm as a scratch register, so it must be
    // listed here alongside the vector temporaries.
92 : "x21", "v0", "v1", "v2", "memory");
93}
94
// Copy one 12-value row (24 bytes) from each of in0..in3 into out, back to
// back, for 4 x 24 = 96 bytes in total. Row n is written at out + n * 0x18.
//
// Each row is moved as a 16-byte transfer through q0 followed by an 8-byte
// transfer through d1; the same two registers are reused for every row.
// The loads are post-indexed (#0x10, then #0x08), so each input pointer -
// all four are references - ends up advanced by 24 bytes.
// ASM_PREFETCH is defined outside this file; presumably it emits a prefetch
// for the given address. It appears after the second post-indexed load of
// each row, so the #192 offset is relative to the already-advanced pointer.
95template <>
96inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
97{
98 __asm __volatile(
    // Row 0 -> out + 0x00 .. 0x17
99 "LDR q0, [%x[in0]], #0x10\n"
100 "STR q0, [%x[out]]\n"
101 "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]")
102 "STR d1, [%x[out], #0x10]\n"
103
    // Row 1 -> out + 0x18 .. 0x2f
104 "LDR q0, [%x[in1]], #0x10\n"
105 "STR q0, [%x[out], #0x18]\n"
106 "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]")
107 "STR d1, [%x[out], #0x28]\n"
108
    // Row 2 -> out + 0x30 .. 0x47
109 "LDR q0, [%x[in2]], #0x10\n"
110 "STR q0, [%x[out], #0x30]\n"
111 "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]")
112 "STR d1, [%x[out], #0x40]\n"
113
    // Row 3 -> out + 0x48 .. 0x5f
114 "LDR q0, [%x[in3]], #0x10\n"
115 "STR q0, [%x[out], #0x48]\n"
116 "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n"
117 : [in0] "+r"(in0),
118 [in1] "+r"(in1),
119 [in2] "+r"(in2),
120 [in3] "+r"(in3),
121 [out] "+r"(out)
122 :
    // v0 / v1 are the vector registers behind the q0 / d1 temporaries above.
123 : "v0", "v1", "memory");
124}
125
126template <>
127template <>
128inline void TransformImpl<12, 1, true, 2, 2>::Transform(
129 uint16_t *out, const uint16_t *const in, const int stride,
130 const int x0, const int xmax, const int k0, const int kmax)
131{
132 TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
133}
134
135#endif // __aarch64__