/*
 * Copyright (c) 2017-2018 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#ifdef __arm__

#include <arm_neon.h>

/**
 * Merge blocks of 6 rows x 8 columns of float results into the output matrix,
 * applying either accumulation ("append") or a per-column bias, followed by an
 * activation clamp.
 *
 * The input is consumed as consecutive 48-float blocks; within a block the
 * value for row r, column c is at offset (8 * r + c).  Blocks are consumed
 * with the column position advancing fastest, then the row position.
 *
 * @param out    Output matrix base pointer; element (y, x) is at out[y * ldout + x].
 * @param in     Packed input blocks as described above.
 * @param ldout  Stride, in elements, between consecutive output rows.
 * @param y0     First output row to write.
 * @param ymax   One past the last output row to write.
 * @param x0     First output column to write.
 * @param xmax   One past the last output column to write.
 * @param bias   Optional per-column bias, indexed by absolute column; may be
 *               null, in which case zero is added.  Ignored when append is true.
 * @param act    Activation: ReLU clamps below at 0, BoundedReLU additionally
 *               clamps above at act.param1, anything else leaves values unclamped.
 * @param append When true, each result is added to the value already in out
 *               before clamping; when false, the bias is added instead.
 */
template<>
void MergeResults<8, 6, false>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float *bias, Activation act, bool append) {
    const float *inptr = in;
    prefetch_6x(inptr);
    prefetch_6x(inptr + 96);

    float nullbias[8];

    /* Scratch row that absorbs results for rows at or beyond ymax.  It is
     * zero-initialised once so that append mode never reads indeterminate
     * values through a redirected row pointer. */
    float dummyres[8] = { 0.0f };

    float minval = - std::numeric_limits<float>::infinity();
    float maxval =   std::numeric_limits<float>::infinity();

    switch(act.type)
    {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            minval = 0.0f;
            break;
    }

    float32x4_t minv = vdupq_n_f32(minval);
    float32x4_t maxv = vdupq_n_f32(maxval);

    /* The zero bias is only ever read in bias mode without a caller-supplied bias. */
    if (!append && !bias)
    {
        memset(nullbias, 0, (8 * sizeof(float)));
    }

    /* Each input block carries 6 rows, so the output row position advances by
     * 6 per block (stepping by 8 here would skip two output rows per block
     * whenever a single call covers more than one block of rows). */
    for (int y=y0; y<ymax; y+=6) {
        float *outptr0 = out + (y * ldout) + x0;
        float *outptr1 = outptr0 + ldout;
        float *outptr2 = outptr1 + ldout;
        float *outptr3 = outptr2 + ldout;
        float *outptr4 = outptr3 + ldout;
        float *outptr5 = outptr4 + ldout;

        prefetch_2x(outptr0);
        prefetch_2x(outptr1);
        prefetch_2x(outptr2);
        prefetch_2x(outptr3);
        prefetch_2x(outptr4);
        prefetch_2x(outptr5);

        for (int i=x0; i<xmax; i+=8) {
            /* Make sure we throw away results if Y isn't a multiple of 6.
             * We do this by pointing the result pointer at a dummy buffer
             * we later discard.  This is redone for every column block
             * because the row pointers are advanced while writing. */
            if ((y+5) >= ymax) {
                switch ((y + 5) - ymax) {
                    case 4:
                        outptr1 = dummyres;
                        /* fall through */
                    case 3:
                        outptr2 = dummyres;
                        /* fall through */
                    case 2:
                        outptr3 = dummyres;
                        /* fall through */
                    case 1:
                        outptr4 = dummyres;
                        /* fall through */
                    case 0:
                        outptr5 = dummyres;
                        break;

                    default:
                        UNREACHABLE("Impossible.");
                }
            }

            /* Number of valid columns left from this block's first column (always >= 1 here). */
            const int remaining = xmax - i;

            if (append) {
                /* Append mode: Read, activate, write. */

                /* For ragged X, manually copy over the valid results. */
                if (remaining < 8) {
                    for (int xi=0; xi<remaining; xi++) {
                        *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
                        outptr0++;
                        *outptr1 = std::min(std::max(minval, inptr[xi + 8] + *outptr1), maxval);
                        outptr1++;
                        *outptr2 = std::min(std::max(minval, inptr[xi + 16] + *outptr2), maxval);
                        outptr2++;
                        *outptr3 = std::min(std::max(minval, inptr[xi + 24] + *outptr3), maxval);
                        outptr3++;
                        *outptr4 = std::min(std::max(minval, inptr[xi + 32] + *outptr4), maxval);
                        outptr4++;
                        *outptr5 = std::min(std::max(minval, inptr[xi + 40] + *outptr5), maxval);
                        outptr5++;
                    }
                    /* Skip the whole block, including the unused padding columns. */
                    inptr += 48;
                } else {
                    /* Optimized routine to copy an entire block */
                    __asm __volatile (
                        // Rows 0-1
                        "VLD1.32 {d0-d3}, [%[inptr]]!\n"
                        "VLD1.32 {d8-d11}, [%[outptr0]]\n"
                        "VLD1.32 {d4-d7}, [%[inptr]]!\n"
                        "VLD1.32 {d12-d15}, [%[outptr1]]\n"

                        "VADD.f32 q4, q4, q0\n"
                        ASM_PREFETCH("[%[inptr], #352]")
                        "VADD.f32 q5, q5, q1\n"
                        "VADD.f32 q6, q6, q2\n"
                        "VADD.f32 q7, q7, q3\n"
                        ASM_PREFETCH("[%[inptr], #416]")
                        "VMAX.f32 q4, q4, %q[minv]\n"
                        "VMAX.f32 q5, q5, %q[minv]\n"
                        "VMAX.f32 q6, q6, %q[minv]\n"
                        ASM_PREFETCH("[%[inptr], #480]")
                        "VMAX.f32 q7, q7, %q[minv]\n"
                        "VMIN.f32 q4, q4, %q[maxv]\n"
                        "VMIN.f32 q5, q5, %q[maxv]\n"
                        "VST1.32 {d8-d11}, [%[outptr0]]!\n"
                        "VMIN.f32 q6, q6, %q[maxv]\n"
                        "VMIN.f32 q7, q7, %q[maxv]\n"
                        "VST1.32 {d12-d15}, [%[outptr1]]!\n"

                        // Rows 2-3
                        "VLD1.32 {d0-d3}, [%[inptr]]!\n"
                        "VLD1.32 {d8-d11}, [%[outptr2]]\n"
                        "VLD1.32 {d4-d7}, [%[inptr]]!\n"
                        "VLD1.32 {d12-d15}, [%[outptr3]]\n"

                        "VADD.f32 q4, q4, q0\n"
                        ASM_PREFETCH("[%[outptr0], #96]")
                        "VADD.f32 q5, q5, q1\n"
                        "VADD.f32 q6, q6, q2\n"
                        "VADD.f32 q7, q7, q3\n"
                        ASM_PREFETCH("[%[outptr1], #96]")
                        "VMAX.f32 q4, q4, %q[minv]\n"
                        "VMAX.f32 q5, q5, %q[minv]\n"
                        "VMAX.f32 q6, q6, %q[minv]\n"
                        ASM_PREFETCH("[%[outptr2], #128]")
                        "VMAX.f32 q7, q7, %q[minv]\n"
                        "VMIN.f32 q4, q4, %q[maxv]\n"
                        "VMIN.f32 q5, q5, %q[maxv]\n"
                        "VST1.32 {d8-d11}, [%[outptr2]]!\n"
                        "VMIN.f32 q6, q6, %q[maxv]\n"
                        "VMIN.f32 q7, q7, %q[maxv]\n"
                        "VST1.32 {d12-d15}, [%[outptr3]]!\n"

                        // Rows 4-5
                        "VLD1.32 {d0-d3}, [%[inptr]]!\n"
                        "VLD1.32 {d8-d11}, [%[outptr4]]\n"
                        "VLD1.32 {d4-d7}, [%[inptr]]!\n"
                        "VLD1.32 {d12-d15}, [%[outptr5]]\n"

                        "VADD.f32 q4, q4, q0\n"
                        ASM_PREFETCH("[%[outptr3], #96]")
                        "VADD.f32 q5, q5, q1\n"
                        "VADD.f32 q6, q6, q2\n"
                        "VADD.f32 q7, q7, q3\n"
                        ASM_PREFETCH("[%[outptr4], #128]")
                        "VMAX.f32 q4, q4, %q[minv]\n"
                        "VMAX.f32 q5, q5, %q[minv]\n"
                        "VMAX.f32 q6, q6, %q[minv]\n"
                        ASM_PREFETCH("[%[outptr5], #128]")
                        "VMAX.f32 q7, q7, %q[minv]\n"
                        "VMIN.f32 q4, q4, %q[maxv]\n"
                        "VMIN.f32 q5, q5, %q[maxv]\n"
                        "VST1.32 {d8-d11}, [%[outptr4]]!\n"
                        "VMIN.f32 q6, q6, %q[maxv]\n"
                        "VMIN.f32 q7, q7, %q[maxv]\n"
                        "VST1.32 {d12-d15}, [%[outptr5]]!\n"
                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr)
                    : [minv] "w" (minv), [maxv] "w" (maxv)
                    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
                    );
                }
            } else {
                /* Bias mode: Add bias to everything, then min/max/write as before. */
                const float *biasptr = bias ? bias + i : nullbias;

                /* For ragged X, manually copy over the valid results. */
                if (remaining < 8) {
                    for (int xi=0; xi<remaining; xi++) {
                        *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
                        outptr0++;
                        *outptr1 = std::min(std::max(minval, inptr[xi + 8] + biasptr[xi]), maxval);
                        outptr1++;
                        *outptr2 = std::min(std::max(minval, inptr[xi + 16] + biasptr[xi]), maxval);
                        outptr2++;
                        *outptr3 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
                        outptr3++;
                        *outptr4 = std::min(std::max(minval, inptr[xi + 32] + biasptr[xi]), maxval);
                        outptr4++;
                        *outptr5 = std::min(std::max(minval, inptr[xi + 40] + biasptr[xi]), maxval);
                        outptr5++;
                    }
                    /* Skip the whole block, including the unused padding columns. */
                    inptr += 48;
                } else {
                    /* Optimized routine to copy an entire block.  The 8 bias
                     * values are loaded once into q0/q1 and reused for all
                     * six rows. */
                    __asm __volatile (
                        // Rows 0-1
                        "VLD1.32 {d8-d11}, [%[inptr]]!\n"
                        "VLD1.32 {d0-d3}, [%[biasptr]]\n"
                        "VLD1.32 {d12-d15}, [%[inptr]]!\n"

                        "VADD.f32 q4, q4, q0\n"
                        ASM_PREFETCH("[%[inptr], #352]")
                        "VADD.f32 q5, q5, q1\n"
                        "VADD.f32 q6, q6, q0\n"
                        "VADD.f32 q7, q7, q1\n"
                        ASM_PREFETCH("[%[inptr], #416]")
                        "VMAX.f32 q4, q4, %q[minv]\n"
                        "VMAX.f32 q5, q5, %q[minv]\n"
                        "VMAX.f32 q6, q6, %q[minv]\n"
                        ASM_PREFETCH("[%[inptr], #480]")
                        "VMAX.f32 q7, q7, %q[minv]\n"
                        "VMIN.f32 q4, q4, %q[maxv]\n"
                        "VMIN.f32 q5, q5, %q[maxv]\n"
                        "VST1.32 {d8-d11}, [%[outptr0]]!\n"
                        "VMIN.f32 q6, q6, %q[maxv]\n"
                        "VMIN.f32 q7, q7, %q[maxv]\n"
                        "VST1.32 {d12-d15}, [%[outptr1]]!\n"

                        // Rows 2-3
                        "VLD1.32 {d8-d11}, [%[inptr]]!\n"
                        "VLD1.32 {d12-d15}, [%[inptr]]!\n"

                        "VADD.f32 q4, q4, q0\n"
                        ASM_PREFETCH("[%[outptr0], #96]")
                        "VADD.f32 q5, q5, q1\n"
                        "VADD.f32 q6, q6, q0\n"
                        "VADD.f32 q7, q7, q1\n"
                        ASM_PREFETCH("[%[outptr1], #96]")
                        "VMAX.f32 q4, q4, %q[minv]\n"
                        "VMAX.f32 q5, q5, %q[minv]\n"
                        "VMAX.f32 q6, q6, %q[minv]\n"
                        ASM_PREFETCH("[%[outptr2], #128]")
                        "VMAX.f32 q7, q7, %q[minv]\n"
                        "VMIN.f32 q4, q4, %q[maxv]\n"
                        "VMIN.f32 q5, q5, %q[maxv]\n"
                        "VST1.32 {d8-d11}, [%[outptr2]]!\n"
                        "VMIN.f32 q6, q6, %q[maxv]\n"
                        "VMIN.f32 q7, q7, %q[maxv]\n"
                        "VST1.32 {d12-d15}, [%[outptr3]]!\n"

                        // Rows 4-5
                        "VLD1.32 {d8-d11}, [%[inptr]]!\n"
                        "VLD1.32 {d12-d15}, [%[inptr]]!\n"

                        "VADD.f32 q4, q4, q0\n"
                        ASM_PREFETCH("[%[outptr3], #96]")
                        "VADD.f32 q5, q5, q1\n"
                        "VADD.f32 q6, q6, q0\n"
                        "VADD.f32 q7, q7, q1\n"
                        ASM_PREFETCH("[%[outptr4], #128]")
                        "VMAX.f32 q4, q4, %q[minv]\n"
                        "VMAX.f32 q5, q5, %q[minv]\n"
                        "VMAX.f32 q6, q6, %q[minv]\n"
                        ASM_PREFETCH("[%[outptr5], #128]")
                        "VMAX.f32 q7, q7, %q[minv]\n"
                        "VMIN.f32 q4, q4, %q[maxv]\n"
                        "VMIN.f32 q5, q5, %q[maxv]\n"
                        "VST1.32 {d8-d11}, [%[outptr4]]!\n"
                        "VMIN.f32 q6, q6, %q[maxv]\n"
                        "VMIN.f32 q7, q7, %q[maxv]\n"
                        "VST1.32 {d12-d15}, [%[outptr5]]!\n"
                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr)
                    : [minv] "w" (minv), [maxv] "w" (maxv), [biasptr] "r" (biasptr)
                    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
                    );
                }
            }
        }
    }
}

#endif // __arm__