blob: 8d61f15cec5bf7a990592f7cdfe74d9afc9039f4 [file] [log] [blame]
Pablo Tello11c3b332018-01-25 15:05:13 +00001/*
2 * Copyright (c) 2017-2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
26#ifdef __aarch64__
27
28#include <arm_neon.h>
Pablo Tello11c3b332018-01-25 15:05:13 +000029
Pablo Telloeb82fd22018-02-23 13:43:50 +000030#include "../asmlib.hpp"
31#include "../utils.hpp"
32
33template <>
34template <typename T>
35void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
36{
37 uint8_t *outptr = (uint8_t *)out;
38 const uint8_t *inptr = (uint8_t *)in;
Pablo Tello11c3b332018-01-25 15:05:13 +000039
40 uint8_t zerobuff[16];
41
Pablo Telloeb82fd22018-02-23 13:43:50 +000042 for(int y = y0; y < ymax; y += 4)
43 {
Pablo Tello11c3b332018-01-25 15:05:13 +000044 const uint8_t *inptr0 = inptr + y * ldin + k0;
45 const uint8_t *inptr1 = inptr0 + ldin;
46 const uint8_t *inptr2 = inptr1 + ldin;
47 const uint8_t *inptr3 = inptr2 + ldin;
48
49 prefetch_2x(inptr0);
50 prefetch_2x(inptr1);
51 prefetch_2x(inptr2);
52 prefetch_2x(inptr3);
53
Pablo Telloeb82fd22018-02-23 13:43:50 +000054 int x = (kmax - k0);
55 for(; x > 15; x -= 16)
56 {
Pablo Tello11c3b332018-01-25 15:05:13 +000057 /* Cope with ragged cases by copying from a buffer of zeroes instead */
Pablo Telloeb82fd22018-02-23 13:43:50 +000058 if((y + 3) >= ymax)
59 {
60 switch((y + 3) - ymax)
61 {
Pablo Tello11c3b332018-01-25 15:05:13 +000062 /* Everything falls through in here */
63 case 2:
64 inptr1 = zerobuff;
65 case 1:
66 inptr2 = zerobuff;
67 case 0:
68 inptr3 = zerobuff;
Pablo Tello11c3b332018-01-25 15:05:13 +000069 break;
Pablo Telloeb82fd22018-02-23 13:43:50 +000070
71 default:
72 UNREACHABLE("Impossible.");
Pablo Tello11c3b332018-01-25 15:05:13 +000073 }
74 }
75
Pablo Telloeb82fd22018-02-23 13:43:50 +000076 __asm __volatile(
77 "LDR q0, [%[inptr0]], #16\n" ASM_PREFETCH("[%[inptr0], #176]") "LDR q1, [%[inptr1]], #16\n" ASM_PREFETCH("[%[inptr1], #176]")
78 "STP q0, q1, [%[outptr]], #32\n"
79 "LDR q0, [%[inptr2]], #16\n" ASM_PREFETCH("[%[inptr2], #176]") "LDR q1, [%[inptr3]], #16\n" ASM_PREFETCH("[%[inptr3], #176]") "STP q0, q1, [%[outptr]], #32\n"
80 : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
81 [outptr] "+r"(outptr)
Pablo Tello11c3b332018-01-25 15:05:13 +000082 :
Pablo Telloeb82fd22018-02-23 13:43:50 +000083 : "v0", "v1");
Pablo Tello11c3b332018-01-25 15:05:13 +000084 }
85
Pablo Telloeb82fd22018-02-23 13:43:50 +000086 if(x > 0)
87 {
Pablo Tello11c3b332018-01-25 15:05:13 +000088 /* Need to duplicate this here, in case we didn't run the main loop. */
Pablo Telloeb82fd22018-02-23 13:43:50 +000089 if((y + 3) >= ymax)
90 {
91 switch((y + 3) - ymax)
92 {
Pablo Tello11c3b332018-01-25 15:05:13 +000093 /* Everything falls through in here */
94 case 2:
95 inptr1 = zerobuff;
96 case 1:
97 inptr2 = zerobuff;
98 case 0:
99 inptr3 = zerobuff;
Pablo Tello11c3b332018-01-25 15:05:13 +0000100 break;
Pablo Telloeb82fd22018-02-23 13:43:50 +0000101
102 default:
103 UNREACHABLE("Impossible.");
Pablo Tello11c3b332018-01-25 15:05:13 +0000104 }
105 }
106
107 /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
Pablo Telloeb82fd22018-02-23 13:43:50 +0000108 auto f = [&outptr, x](const uint8_t *&p)
109 {
110 for(int i = 0; i < 16; i++)
111 {
112 if(i < x)
113 {
Pablo Tello11c3b332018-01-25 15:05:13 +0000114 *outptr++ = *p++;
Pablo Telloeb82fd22018-02-23 13:43:50 +0000115 }
116 else
117 {
Pablo Tello11c3b332018-01-25 15:05:13 +0000118 *outptr++ = 0;
119 }
120 }
121 };
122
123 f(inptr0);
124 f(inptr1);
125 f(inptr2);
126 f(inptr3);
127 }
128 }
129}
130
Pablo Telloeb82fd22018-02-23 13:43:50 +0000131#endif // __aarch64__