blob: ea32c9665c2d50c45e380b6ed6853bcb2958b759 [file] [log] [blame]
Pablo Telloeb82fd22018-02-23 13:43:50 +00001/*
2 * Copyright (c) 2017-2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
26#ifdef __arm__
27
28#include "transpose_interleave_common.hpp"
29
30// Generic unblocked transposed 8x32-bit sized specialisation
31template <>
32template <typename T>
33inline void TransformImpl<8, 1, true, 4, 4>::Transform(
34 T *out, const T *const in, const int stride,
35 const int x0, const int xmax, const int k0, const int kmax)
36{
37 // Redirect to a 16x uint16_t specialisation
38 TransformImpl<16, 1, true, 2, 2>::Transform(
39 reinterpret_cast<uint16_t *>(out),
40 reinterpret_cast<const uint16_t *const>(in),
41 stride * 2, x0 * 2, xmax * 2, k0, kmax);
42}
43
44// Generic 12x16-bit sized specialisation
45template <>
46template <typename T>
47inline void TransformImpl<16, 1, true, 2, 2>::Transform(
48 T *out, const T *const in, const int stride,
49 const int x0, const int xmax, const int k0, const int kmax)
50{
51 // Redirect to a uint16_t specialisation
52 Transform(
53 reinterpret_cast<uint16_t *>(out),
54 reinterpret_cast<const uint16_t *const>(in),
55 stride, x0, xmax, k0, kmax);
56}
57
58// Specialised 16 x uint16_t version
59template <>
60inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
61{
62 __asm volatile(
63 "VLD1.32 {d0-d3}, [%[in0]]!\n"
64 "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
65 : [in0] "+r"(in0),
66 [out] "+r"(out)
67 :
68 : "q0", "q1", "memory");
69}
70
71template <>
72inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
73{
74 __asm volatile(
75 "VLD1.32 {d0-d3}, [%[in0]]!\n"
76 "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
77 "VLD1.32 {d0-d3}, [%[in1]]!\n"
78 "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in1], #192]") "SUB %[out], %[out], #32\n"
79 : [in0] "+r"(in0),
80 [in1] "+r"(in1),
81 [out] "+r"(out)
82 :
83 : "q0", "q1", "memory");
84}
85
86template <>
87inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
88{
89 __asm __volatile(
90 "VLD1.32 {d0-d3}, [%[in0]]!\n"
91 "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
92 "VLD1.32 {d0-d3}, [%[in1]]!\n"
93 "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in1], #192]")
94 "VLD1.32 {d0-d3}, [%[in2]]!\n"
95 "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in2], #192]")
96 "VLD1.32 {d0-d3}, [%[in3]]!\n"
97 "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in3], #192]") "SUB %[out], %[out], #96\n"
98 : [in0] "+r"(in0),
99 [in1] "+r"(in1),
100 [in2] "+r"(in2),
101 [in3] "+r"(in3),
102 [out] "+r"(out)
103 :
104 : "q0", "q1", "memory");
105}
106
107template <>
108template <>
109inline void TransformImpl<16, 1, true, 2, 2>::Transform(
110 uint16_t *out, const uint16_t *const in, const int stride,
111 const int x0, const int xmax, const int k0, const int kmax)
112{
113 TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
114}
115
116#endif // __arm__