blob: 15cc04b35231701b16acc79830b373ed216b9754 [file] [log] [blame]
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
Georgios Pinitas4074c992018-01-30 18:13:46 +000025#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
Pablo Tellod3d97d22018-10-05 10:59:48 +010026#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
Georgios Pinitas4074c992018-01-30 18:13:46 +000027#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
Pablo Tello9ceebbe2018-01-10 16:44:13 +000028
namespace
{

/** Shared implementation of the 4x4 / 3x3 fp32 Winograd output transform for
 * a single output tile.
 *
 * For every channel a 6x6 tile F is gathered from 36 Winograd-domain matrices
 * (element m of the tile is read from `matrix_base + m*matrix_stride`, with
 * consecutive channels stored contiguously), the 4x4 tile f = ZT F Z is
 * computed, and the top-left `cells_i x cells_j` cells of f are stored to the
 * output, where ZT is
 *
 *     1  1  1  1  1  0
 *     0  1 -1  2 -2  0
 *     0  1  1  4  4  0
 *     0  1 -1  8 -8  1
 *
 * Channels are consumed four at a time on AArch64, then two at a time when
 * `__arm_any__` is defined, and finally one at a time in plain C++.
 *
 * @tparam Specialized  When true the padding is taken from PadBottom/PadRight
 *                      and the run-time padding arguments are ignored.
 * @tparam PadBottom    Number of bottom rows of the tile which are not written.
 * @tparam PadRight     Number of right-hand columns which are not written.
 * @tparam HasBias      When true one bias value per channel is read from
 *                      `biases` and added to every cell that is written; when
 *                      false `biases` is never dereferenced.
 *
 * @param n_channels         Number of channels to transform.
 * @param matrix_base        Pointer to the first of the 36 input matrices.
 * @param matrix_stride      Distance, in floats, between successive matrices.
 * @param biases             Per-channel biases (only read if HasBias).
 * @param output             Pointer to cell (0, 0) of the output tile.
 * @param output_row_stride  Distance, in floats, between output rows.
 * @param output_col_stride  Distance, in floats, between output columns.
 * @param _pad_bottom        Run-time bottom padding (used if !Specialized).
 * @param _pad_right         Run-time right padding (used if !Specialized).
 */
template <bool Specialized, int PadBottom, int PadRight, bool HasBias>
void winograd_output_transform_4x4_3x3_fp32_process_tile_impl(
  const int n_channels,
  const float* const matrix_base,
  const int matrix_stride,
  const float* const biases,
  float* const output,
  const int output_row_stride,
  const int output_col_stride,
  const int _pad_bottom,
  const int _pad_right
)
{
  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
  const int pad_right = Specialized ? PadRight : _pad_right;
  constexpr int TileRows = 4, TileCols = 4;

  const int cells_i = TileRows - pad_bottom;
  const int cells_j = TileCols - pad_right;

  // Construct a map to the output cells; only the cells which will actually be
  // written are initialised.
  float *outptrs[TileRows][TileCols];
  for (int i = 0; i < cells_i; i++)
  {
    for (int j = 0; j < cells_j; j++)
    {
      outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
    }
  }
  const float *inptr = matrix_base;
  const float *bptr = biases;

  // For each channel of the output
  int channels_remaining = n_channels;
#ifdef __aarch64__
  for (; channels_remaining >= 4; channels_remaining -= 4)
  {
    // Matrices used and computed during this transform
    float32x4_t F[6][6], FZ[6][4], f[4][4];

    // Read a 6x6 tile in the Winograd domain
    for (int i = 0, m = 0; i < 6; i++)
    {
      for (int j = 0; j < 6; j++, m++)
      {
        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
      }
    }
    inptr += 4;

    // Compute the matrix F Z
    for (int i = 0; i < 6; i++)
    {
      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
      FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);

      // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
      FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);

      // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
      FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);

      // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
      FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
    }

    // Compute the output tile f = ZT F Z
    for (int j = 0; j < 4; j++)
    {
      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
      f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);

      // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
      f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);

      // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
      f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);

      // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
      f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
    }

    // Write out the output tile (HasBias is a compile-time constant, so only
    // one of the two branches survives in each instantiation).
    if (HasBias)
    {
      const float32x4_t b = vld1q_f32(bptr);
      bptr += 4;
      for (int i = 0; i < cells_i; i++)
      {
        for (int j = 0; j < cells_j; j++)
        {
          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
          outptrs[i][j] += 4;
        }
      }
    }
    else
    {
      for (int i = 0; i < cells_i; i++)
      {
        for (int j = 0; j < cells_j; j++)
        {
          vst1q_f32(outptrs[i][j], f[i][j]);
          outptrs[i][j] += 4;
        }
      }
    }
  }
#endif  // __aarch64__
#ifdef __arm_any__
  for (; channels_remaining >= 2; channels_remaining -= 2)
  {
    // Matrices used and computed during this transform
    float32x2_t F[6][6], FZ[6][4], f[4][4];

    // Read a 6x6 tile in the Winograd domain
    for (int i = 0, m = 0; i < 6; i++)
    {
      for (int j = 0; j < 6; j++, m++)
      {
        F[i][j] = vld1_f32(inptr + m*matrix_stride);
      }
    }
    inptr += 2;

    // Compute the matrix F Z
    for (int i = 0; i < 6; i++)
    {
      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
      FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);

      // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
      FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);

      // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
      FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);

      // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
      FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
    }

    // Compute the output tile f = ZT F Z
    for (int j = 0; j < 4; j++)
    {
      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
      f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);

      // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
      f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);

      // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
      f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);

      // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
      f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
    }

    // Write out the output tile
    if (HasBias)
    {
      const float32x2_t b = vld1_f32(bptr);
      bptr += 2;
      for (int i = 0; i < cells_i; i++)
      {
        for (int j = 0; j < cells_j; j++)
        {
          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
          outptrs[i][j] += 2;
        }
      }
    }
    else
    {
      for (int i = 0; i < cells_i; i++)
      {
        for (int j = 0; j < cells_j; j++)
        {
          vst1_f32(outptrs[i][j], f[i][j]);
          outptrs[i][j] += 2;
        }
      }
    }
  }
#endif  // __arm_any__
  for (; channels_remaining; channels_remaining--)
  {
    // Matrices used and computed during this transform
    float F[6][6], FZ[6][4], f[4][4];

    // Read a 6x6 tile in the Winograd domain
    for (int i = 0, m = 0; i < 6; i++)
    {
      for (int j = 0; j < 6; j++, m++)
      {
        F[i][j] = *(inptr + m*matrix_stride);
      }
    }
    inptr++;

    // Compute the matrix F Z
    for (int i = 0; i < 6; i++)
    {
      FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
      FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
      FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
      FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
    }

    // Compute the output tile f = ZT F Z
    for (int j = 0; j < 4; j++)
    {
      f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
      f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
      f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
      f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
    }

    // Write out the output tile
    if (HasBias)
    {
      const float b = *(bptr++);
      for (int i = 0; i < cells_i; i++)
      {
        for (int j = 0; j < cells_j; j++)
        {
          *(outptrs[i][j]++) = f[i][j] + b;
        }
      }
    }
    else
    {
      for (int i = 0; i < cells_i; i++)
      {
        for (int j = 0; j < cells_j; j++)
        {
          *(outptrs[i][j]++) = f[i][j];
        }
      }
    }
  }
}

/** Apply the 4x4 / 3x3 fp32 Winograd output transform to one output tile.
 *
 * Selects the bias or bias-free implementation once per tile, depending on
 * whether `biases` is null, and forwards every argument unchanged.
 *
 * @tparam Specialized  When true the padding is taken from PadBottom/PadRight
 *                      and `_pad_bottom`/`_pad_right` are ignored.
 * @tparam PadBottom    Compile-time bottom padding (rows not written).
 * @tparam PadRight     Compile-time right padding (columns not written).
 *
 * @param n_channels         Number of channels to transform.
 * @param matrix_base        Pointer to the first of the 36 input matrices.
 * @param matrix_stride      Distance, in floats, between successive matrices.
 * @param biases             Per-channel biases, or nullptr for no bias.
 * @param output             Pointer to cell (0, 0) of the output tile.
 * @param output_row_stride  Distance, in floats, between output rows.
 * @param output_col_stride  Distance, in floats, between output columns.
 * @param _pad_bottom        Run-time bottom padding (used if !Specialized).
 * @param _pad_right         Run-time right padding (used if !Specialized).
 */
template <bool Specialized, int PadBottom=0, int PadRight=0>
void winograd_output_transform_4x4_3x3_fp32_process_tile(
  const int n_channels,
  const float* const matrix_base,
  const int matrix_stride,
  const float* const biases,
  float* const output,
  const int output_row_stride,
  const int output_col_stride,
  const int _pad_bottom,
  const int _pad_right
)
{
  if (biases)
  {
    winograd_output_transform_4x4_3x3_fp32_process_tile_impl<Specialized, PadBottom, PadRight, true>(
      n_channels, matrix_base, matrix_stride, biases,
      output, output_row_stride, output_col_stride,
      _pad_bottom, _pad_right
    );
  }
  else
  {
    winograd_output_transform_4x4_3x3_fp32_process_tile_impl<Specialized, PadBottom, PadRight, false>(
      n_channels, matrix_base, matrix_stride, biases,
      output, output_row_stride, output_col_stride,
      _pad_bottom, _pad_right
    );
  }
}

}  // namespace (anonymous)
402
403namespace winograd
Pablo Tello9ceebbe2018-01-10 16:44:13 +0000404{
Pablo Tellod3d97d22018-10-05 10:59:48 +0100405using Tiles = OutputTransformImplTiles<3, 3, 6, 6, float>;
406
407template <>
408const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_4x4_3x3_fp32_process_tile<false>;
409
410template <>
411const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4x4_3x3_fp32_process_tile<true>;
412
413template <>
414const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
415 winograd_output_transform_4x4_3x3_fp32_process_tile<true, 1, 0>,
416 winograd_output_transform_4x4_3x3_fp32_process_tile<true, 2, 0>,
417 winograd_output_transform_4x4_3x3_fp32_process_tile<true, 3, 0>,
Pablo Tello9ceebbe2018-01-10 16:44:13 +0000418};
419
Pablo Tellod3d97d22018-10-05 10:59:48 +0100420template <>
421const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
422 winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 1>,
423 winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 2>,
424 winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 3>,
425};
426
427template class OutputTransform<3, 3, 6, 6, float>;
Pablo Tello9ceebbe2018-01-10 16:44:13 +0000428} // namespace winograd