blob: 2228f80e5112342b872eeba12238795803d5299d [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
27/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
28 *
29 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
30 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
31 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
32 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
33 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
34 *
35 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
36 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
37 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
38 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
39 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
40 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
41 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
42 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
43 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
44 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
45 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
46 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
47 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
48 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
49 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
50 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
51 */
52__kernel void winograd_output_transform_2x2_3x3_nchw(
53 TENSOR3D_DECLARATION(src),
54 TENSOR3D_DECLARATION(dst)
55#if defined(HAS_BIAS)
56 ,
57 VECTOR_DECLARATION(bias)
58#endif // defined(HAS_BIAS)
59)
60{
61 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
62 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
63
64 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
65
66 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
67 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
68 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
69 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
70 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
71
72#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
73 // Compute the 2x1 or 1x2 output tile
74 // out00 = d00 + d01 + d02
75 // out01 = d01 - d02 - d03
76
77 float out00 = d00 + d01 + d02;
78 float out01 = d01 - d02 - d03;
79#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
80 float d10 = *((__global float *)(src_addr + 4 * src_stride_z));
81 float d11 = *((__global float *)(src_addr + 5 * src_stride_z));
82 float d12 = *((__global float *)(src_addr + 6 * src_stride_z));
83 float d13 = *((__global float *)(src_addr + 7 * src_stride_z));
84
85 float d20 = *((__global float *)(src_addr + 8 * src_stride_z));
86 float d21 = *((__global float *)(src_addr + 9 * src_stride_z));
87 float d22 = *((__global float *)(src_addr + 10 * src_stride_z));
88 float d23 = *((__global float *)(src_addr + 11 * src_stride_z));
89
90 float d30 = *((__global float *)(src_addr + 12 * src_stride_z));
91 float d31 = *((__global float *)(src_addr + 13 * src_stride_z));
92 float d32 = *((__global float *)(src_addr + 14 * src_stride_z));
93 float d33 = *((__global float *)(src_addr + 15 * src_stride_z));
94
95 // Compute the 2x2 output tile
96 float k0 = d01 + d11 + d21;
97 float k1 = d02 + d12 + d22;
98 float k2 = d11 - d21 - d31;
99 float k3 = d12 - d22 - d32;
100
101 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
102 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
103 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
104 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
105
106 float out00 = d10;
107 float out01 = -d13;
108 float out10 = d10;
109 float out11 = -d13;
110
111 out00 += d00 + d20 + k0 + k1;
112 out01 += k0 - k1 - (d03 + d23);
113 out10 += -d20 - d30 + k2 + k3;
114 out11 += k2 - k3 + d23 + d33;
115#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
116
117 int y_in = get_global_id(1);
118 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
119 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
120 int z_out = get_global_id(0);
121
122#if defined(HAS_BIAS)
123 // Add bias
124 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
125
126 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
127
128 out00 += (float)b;
129 out01 += (float)b;
130#endif // defined(HAS_BIAS)
131
132 // Get output address
133 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
134
135 // Store the output tile
136#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
137 *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
138 *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
139#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
140 vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
141#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
142
143#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
144#if defined(HAS_BIAS)
145 // Add bias
146 out10 += (float)b;
147 out11 += (float)b;
148#endif // defined(HAS_BIAS)
149
150 vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
151#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
152}
153
154/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
155 *
156 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
157 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
158 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
159 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
160 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
161 *
162 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
163 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
164 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
165 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
166 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
167 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
168 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
169 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
170 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
171 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
172 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
173 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
174 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
175 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
176 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
177 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
178 */
179__kernel void winograd_output_transform_4x4_3x3_nchw(
180 TENSOR3D_DECLARATION(src),
181 TENSOR3D_DECLARATION(dst)
182#if defined(HAS_BIAS)
183 ,
184 VECTOR_DECLARATION(bias)
185#endif // defined(HAS_BIAS)
186)
187{
188 // Each thread stores a 4x4/4x1 or 1x4 tile
189 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
190
191 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
192
193 // Load the values across the channels to compose the 6x6 or 6x1 tile
194 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
195 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
196 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
197 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
198 float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
199 float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
200
201#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
202 // Compute out00, out01, out02 and out03
203 float out00 = d00 + d01 + d02 + d03 + d04;
204 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
205 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
206 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
207#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
208 float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
209 float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
210 float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
211 float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
212 float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
213 float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
214
215 float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
216 float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
217 float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
218 float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
219 float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
220 float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
221
222 float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
223 float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
224 float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
225 float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
226 float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
227 float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
228
229 float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
230 float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
231 float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
232 float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
233 float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
234 float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
235
236 float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
237 float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
238 float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
239 float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
240 float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
241 float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
242
243 // Compute out00, out01, out02 and out03
244 float out00 = d01 + d21 + d41 + d11 + d31;
245 float out01 = d01 + d21 + d41 + d11 + d31;
246 float out02 = d01 + d21 + d41 + d11 + d31;
247 float out03 = d01 + d21 + d41 + d11 + d31;
248
249 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
250 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
251
252 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
253 out01 += k1 - d02 - d12 - d22 - d32 - d42;
254 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
255 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
256
257 // Compute out10, out11, out12 and out13
258 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
259 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
260 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
261 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
262
263 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
264 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
265
266 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
267 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
268 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
269 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
270
271 // Compute out20, out21, out22 and out23
272 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
273 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
274 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
275 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
276
277 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
278 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
279
280 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
281 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
282 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
283 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
284
285 // Compute out30, out31, out32 and out33
286 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
287 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
288 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
289 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
290
291 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
292 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
293
294 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
295 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
296 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
297 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
298#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
299
300 int y_in = get_global_id(1);
301 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
302 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
303 int z_out = get_global_id(0);
304
305#if defined(HAS_BIAS)
306 // Add bias
307 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
308
309 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
310
311 out00 += (float)b;
312 out01 += (float)b;
313 out02 += (float)b;
314 out03 += (float)b;
315#endif // defined(HAS_BIAS)
316
317 // Get output address
318 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
319
320 // Store the output tile
321#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
322 *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
323 *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
324 *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;
325 *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;
326#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
327 vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
328#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
329
330#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
331#if defined(HAS_BIAS)
332 // Add bias
333 out10 += (float)b;
334 out11 += (float)b;
335 out12 += (float)b;
336 out13 += (float)b;
337
338 out20 += (float)b;
339 out21 += (float)b;
340 out22 += (float)b;
341 out23 += (float)b;
342
343 out30 += (float)b;
344 out31 += (float)b;
345 out32 += (float)b;
346 out33 += (float)b;
347#endif // defined(HAS_BIAS)
348 vstore4((float4)(out10, out11, out12, out13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
349 vstore4((float4)(out20, out21, out22, out23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
350 vstore4((float4)(out30, out31, out32, out33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
351#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
352}
353
Giorgio Arena149fdf32018-07-04 17:03:33 +0100354/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100355 *
356 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100357 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
358 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
359 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
360 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100361 *
362 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
363 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
364 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
365 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
366 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
367 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
368 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
369 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
370 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
371 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
372 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
373 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
374 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
375 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
376 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
377 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
378 * @param[in] dst_size Size of the destination tensor, minus the last padding
379 */
380__kernel void winograd_output_transform_4x4_3x3_nhwc(
381 TENSOR3D_DECLARATION(src),
382 TENSOR3D_DECLARATION(dst),
383#if defined(HAS_BIAS)
384 VECTOR_DECLARATION(bias),
385#endif // defined(HAS_BIAS)
386 int dst_size)
387{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100388 // Each thread stores a 4x4/4x1 or 1x4 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100389 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
390
391 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
392
Giorgio Arena149fdf32018-07-04 17:03:33 +0100393 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100394 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
395 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
396 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
397 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
398 float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
399 float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
400
Giorgio Arena149fdf32018-07-04 17:03:33 +0100401#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
402 // Compute out00, out01, out02 and out03
403 float out00 = d00 + d01 + d02 + d03 + d04;
404 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
405 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
406 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
407#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
408
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100409 float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
410 float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
411 float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
412 float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
413 float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
414 float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
415
416 float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
417 float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
418 float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
419 float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
420 float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
421 float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
422
423 float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
424 float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
425 float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
426 float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
427 float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
428 float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
429
430 float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
431 float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
432 float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
433 float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
434 float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
435 float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
436
437 float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
438 float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
439 float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
440 float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
441 float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
442 float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
443
444 // Compute out00, out01, out02 and out03
445 float out00 = d01 + d21 + d41 + d11 + d31;
446 float out01 = d01 + d21 + d41 + d11 + d31;
447 float out02 = d01 + d21 + d41 + d11 + d31;
448 float out03 = d01 + d21 + d41 + d11 + d31;
449
450 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
451 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
452
453 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
454 out01 += k1 - d02 - d12 - d22 - d32 - d42;
455 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
456 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
457
458 // Compute out10, out11, out12 and out13
459 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
460 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
461 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
462 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
463
464 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
465 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
466
467 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
468 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
469 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
470 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
471
472 // Compute out20, out21, out22 and out23
473 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
474 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
475 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
476 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
477
478 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
479 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
480
481 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
482 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
483 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
484 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
485
486 // Compute out30, out31, out32 and out33
487 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
488 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
489 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
490 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
491
492 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
493 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
494
495 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
496 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
497 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
498 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100499#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100500
501 int y_in = get_global_id(1);
502 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100503 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
504 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100505
506#if defined(HAS_BIAS)
507 // Add bias
508 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
509
Giorgio Arenaea55f912018-07-12 15:41:35 +0100510 float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100511
512 out00 += (float)b;
513 out01 += (float)b;
514 out02 += (float)b;
515 out03 += (float)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100516#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100517 out10 += (float)b;
518 out11 += (float)b;
519 out12 += (float)b;
520 out13 += (float)b;
521
522 out20 += (float)b;
523 out21 += (float)b;
524 out22 += (float)b;
525 out23 += (float)b;
526
527 out30 += (float)b;
528 out31 += (float)b;
529 out32 += (float)b;
530 out33 += (float)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100531#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100532
533#endif // defined(HAS_BIAS)
534
Giorgio Arena149fdf32018-07-04 17:03:33 +0100535#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
536 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
537 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
538
539 // Store the 1x4 output tile
540 *((__global float *)(dst_ptr + offset.s0)) = out00;
541 *((__global float *)(dst_ptr + offset.s1)) = out01;
542 *((__global float *)(dst_ptr + offset.s2)) = out02;
543 *((__global float *)(dst_ptr + offset.s3)) = out03;
544#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
545 // Store the 4x1 output tile
Giorgio Arenad02eb452018-07-18 11:45:30 +0100546 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
547 int mult_y = min(dst_size - offset, 1);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100548
Giorgio Arenad02eb452018-07-18 11:45:30 +0100549 *((__global float *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out00;
550 *((__global float *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out01;
551 *((__global float *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out02;
552 *((__global float *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100553#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100554 // Get output address
555 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
556 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
557 int4 mult_y = min(dst_size - offset, 1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
558
559 // Store the 4x4 output tile
560 *((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;
561 *((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;
562 *((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;
563 *((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;
564 *((__global float *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;
565 *((__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;
566 *((__global float *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;
567 *((__global float *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;
568 *((__global float *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;
569 *((__global float *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;
570 *((__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;
571 *((__global float *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;
572 *((__global float *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;
573 *((__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;
574 *((__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;
575 *((__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100576
577#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100578}
579
580#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
581 ({ \
582 comm_fact.s0 = d1 + d2; \
583 comm_fact.s1 = d3 + d4; \
584 comm_fact.s2 = d5 + d6; \
585 \
586 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
587 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
588 \
589 comm_fact.s0 = d1 - d2; \
590 comm_fact.s1 = d3 - d4; \
591 comm_fact.s2 = d5 - d6; \
592 \
593 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
594 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
595 })
596
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100597/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100598 *
599 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100600 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
601 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
602 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
603 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100604 *
605 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
606 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
607 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
608 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
609 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
610 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
611 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
612 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
613 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
614 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
615 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
616 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
617 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
618 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
619 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
620 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
621 */
622__kernel void winograd_output_transform_4x4_5x5_nchw(
623 TENSOR3D_DECLARATION(src),
624 TENSOR3D_DECLARATION(dst)
625#if defined(HAS_BIAS)
626 ,
627 VECTOR_DECLARATION(bias)
628#endif // defined(HAS_BIAS)
629)
630{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100631 // Each thread stores a 4x4/4x1 or 1x4 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100632 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
633
634 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
635
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100636 // Compute output address
637 int y_in = get_global_id(1);
638 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
639 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
640 int z_out = get_global_id(0);
641
642 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
643
644 // Load the values across the channels to compose the input tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100645 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
646 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
647 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
648 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
649 float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
650 float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
651 float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
652 float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
653
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100654#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
655 // Compute out00, out01, out02 and out03
656 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
657 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
658 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
659 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
660
661#if defined(HAS_BIAS)
662 // Add bias
663 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
664
665 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
666
667 out00 += (float)b;
668 out01 += (float)b;
669 out02 += (float)b;
670 out03 += (float)b;
671#endif // defined(HAS_BIAS)
672
673 // Store the output tile
674#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
675 *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
676 *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
677 *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;
678 *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;
679#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
680 vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr));
681#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
682
683#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad02eb452018-07-18 11:45:30 +0100684 float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
685 float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
686 float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
687 float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
688 float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
689 float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
690 float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
691 float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100692
693 float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
694 float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
695 float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
696 float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
697 float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
698 float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
699 float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
700 float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
701
702 float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
703 float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
704 float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
705 float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
706 float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
707 float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
708 float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
709 float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
710
711 float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
712 float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
713 float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
714 float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
715 float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
716 float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
717 float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
718 float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
719
720 float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
721 float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
722 float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
723 float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
724 float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
725 float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
726 float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
727 float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
728
729 float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
730 float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
731 float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
732 float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
733 float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
734 float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
735 float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
736 float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
737
738 float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
739 float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
740 float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
741 float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
742 float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
743 float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
744 float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
745 float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
746
747 // Compute the 8x4 intermediate tensor
748 float4 comm_fact0, comm_fact1, comm_fact2;
749 float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
750
751 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
752 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
753 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
754 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
755 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
756 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
757 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
758 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
759
760 // Compute the 4x4 output tile
761 comm_fact0 = tmp_col1 + tmp_col2;
762 comm_fact1 = tmp_col3 + tmp_col4;
763 comm_fact2 = tmp_col5 + tmp_col6;
764
765 float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
766 float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
767
768 comm_fact0 = tmp_col1 - tmp_col2;
769 comm_fact1 = tmp_col3 - tmp_col4;
770 comm_fact2 = tmp_col5 - tmp_col6;
771
772 float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
773 float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
774
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100775#if defined(HAS_BIAS)
776 // Add bias
777 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
778
779 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
780
781 out_col0 += (float4)b;
782 out_col1 += (float4)b;
783 out_col2 += (float4)b;
784 out_col3 += (float4)b;
785#endif // defined(HAS_BIAS)
786
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100787 // Store the output tile
788 vstore4((float4)(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
789 vstore4((float4)(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
790 vstore4((float4)(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
791 vstore4((float4)(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
792#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100793}
794
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100795/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100796 *
797 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100798 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
799 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
800 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
801 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100802 *
803 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
804 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
805 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
806 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
807 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
808 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
809 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
810 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
811 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
812 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
813 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
814 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
815 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
816 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
817 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
818 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
819 */
820__kernel void winograd_output_transform_4x4_5x5_nhwc(
821 TENSOR3D_DECLARATION(src),
822 TENSOR3D_DECLARATION(dst),
823#if defined(HAS_BIAS)
824 VECTOR_DECLARATION(bias),
825#endif // defined(HAS_BIAS)
826 int dst_size)
827{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100828 // Each thread stores a 4x4/4x1 or 1x4 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100829 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
830
831 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
832
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100833 int y_in = get_global_id(1);
834 int x_out = get_global_id(0);
835 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
836 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
837
838 // Load the values across the channels to compose the input tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100839 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
840 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
841 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
842 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
843 float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
844 float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
845 float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
846 float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
847
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100848#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
849 // Compute out00, out01, out02 and out03
850 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
851 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
852 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
853 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
854
855#if defined(HAS_BIAS)
856 // Add bias
857 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
858
Giorgio Arenaea55f912018-07-12 15:41:35 +0100859 float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100860
861 out00 += (float)b;
862 out01 += (float)b;
863 out02 += (float)b;
864 out03 += (float)b;
865#endif // defined(HAS_BIAS)
866
867 // Store the output tile
868#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
869 // Get output address
870 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
871 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
872
873 *(__global float *)(dst_ptr + offset.s0) = out00;
874 *(__global float *)(dst_ptr + offset.s1) = out01;
875 *(__global float *)(dst_ptr + offset.s2) = out02;
876 *(__global float *)(dst_ptr + offset.s3) = out03;
877#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
878 // Get output address
879 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
880
881 *(__global float *)(dst_ptr + 0 * dst_stride_y + offset) = out00;
882 *(__global float *)(dst_ptr + 1 * dst_stride_y + offset) = out01;
883 *(__global float *)(dst_ptr + 2 * dst_stride_y + offset) = out02;
884 *(__global float *)(dst_ptr + 3 * dst_stride_y + offset) = out03;
885#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
886
887#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
888
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100889 float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
890 float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
891 float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
892 float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
893 float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
894 float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
895 float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
896 float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
897
898 float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
899 float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
900 float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
901 float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
902 float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
903 float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
904 float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
905 float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
906
907 float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
908 float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
909 float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
910 float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
911 float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
912 float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
913 float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
914 float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
915
916 float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
917 float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
918 float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
919 float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
920 float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
921 float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
922 float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
923 float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
924
925 float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
926 float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
927 float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
928 float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
929 float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
930 float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
931 float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
932 float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
933
934 float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
935 float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
936 float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
937 float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
938 float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
939 float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
940 float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
941 float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
942
943 float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
944 float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
945 float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
946 float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
947 float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
948 float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
949 float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
950 float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
951
952 // Compute the 8x4 intermediate tensor
953 float4 comm_fact0, comm_fact1, comm_fact2;
954 float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
955
956 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
957 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
958 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
959 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
960 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
961 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
962 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
963 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
964
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100965 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100966 comm_fact0 = tmp_col1 + tmp_col2;
967 comm_fact1 = tmp_col3 + tmp_col4;
968 comm_fact2 = tmp_col5 + tmp_col6;
969
970 float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
971 float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
972
973 comm_fact0 = tmp_col1 - tmp_col2;
974 comm_fact1 = tmp_col3 - tmp_col4;
975 comm_fact2 = tmp_col5 - tmp_col6;
976
977 float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
978 float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
979
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100980#if defined(HAS_BIAS)
981 // Add bias
982 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
983
Giorgio Arenaea55f912018-07-12 15:41:35 +0100984 float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100985
986 out_col0 += (float4)b;
987 out_col1 += (float4)b;
988 out_col2 += (float4)b;
989 out_col3 += (float4)b;
990#endif // defined(HAS_BIAS)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100991 // Get output address
992 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
993 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
994 int4 mult_y = min(dst_size - offset, 1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
995
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100996 // Store the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100997 *(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0) = out_col0.s0;
998 *(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0) = out_col1.s0;
999 *(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0) = out_col2.s0;
1000 *(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0) = out_col3.s0;
Giorgio Arenad02eb452018-07-18 11:45:30 +01001001 *(__global float *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1) = out_col0.s1;
1002 *(__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1) = out_col1.s1;
1003 *(__global float *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1) = out_col2.s1;
1004 *(__global float *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1) = out_col3.s1;
1005 *(__global float *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2) = out_col0.s2;
1006 *(__global float *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2) = out_col1.s2;
1007 *(__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2) = out_col2.s2;
1008 *(__global float *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2) = out_col3.s2;
1009 *(__global float *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3) = out_col0.s3;
1010 *(__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3) = out_col1.s3;
1011 *(__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3) = out_col2.s3;
1012 *(__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3) = out_col3.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001013#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001014}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001015
1016#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1017/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1018 *
1019 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1020 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1021 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1022 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1023 *
1024 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1025 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1026 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1027 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1028 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1029 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1030 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1031 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1032 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1033 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1034 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1035 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1036 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1037 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1038 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1039 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1040 */
1041__kernel void winograd_output_transform_2x1_3x1_nchw(
1042 TENSOR3D_DECLARATION(src),
1043 TENSOR3D_DECLARATION(dst)
1044#if defined(HAS_BIAS)
1045 ,
1046 VECTOR_DECLARATION(bias)
1047#endif // defined(HAS_BIAS)
1048)
1049{
1050 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1051 src_stride_x,
1052 src_step_x,
1053 src_stride_y,
1054 src_step_y,
1055 src_stride_z,
1056 src_step_z,
1057 src_offset_first_element_in_bytes,
1058 dst_ptr,
1059 dst_stride_x,
1060 dst_step_x,
1061 dst_stride_y,
1062 dst_step_y,
1063 dst_stride_z,
1064 dst_step_z,
1065 dst_offset_first_element_in_bytes
1066#if defined(HAS_BIAS)
1067 ,
1068 bias_ptr,
1069 bias_stride_x,
1070 bias_step_x,
1071 bias_offset_first_element_in_bytes
1072#endif // defined(HAS_BIAS)
1073 );
1074}
1075
1076/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1077 *
1078 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1079 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1080 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1081 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1082 *
1083 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1084 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1085 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1086 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1087 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1088 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1089 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1090 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1091 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1092 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1093 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1094 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1095 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1096 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1097 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1098 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1099 */
1100__kernel void winograd_output_transform_4x1_3x1_nchw(
1101 TENSOR3D_DECLARATION(src),
1102 TENSOR3D_DECLARATION(dst)
1103#if defined(HAS_BIAS)
1104 ,
1105 VECTOR_DECLARATION(bias)
1106#endif // defined(HAS_BIAS)
1107)
1108{
1109 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1110 src_stride_x,
1111 src_step_x,
1112 src_stride_y,
1113 src_step_y,
1114 src_stride_z,
1115 src_step_z,
1116 src_offset_first_element_in_bytes,
1117 dst_ptr,
1118 dst_stride_x,
1119 dst_step_x,
1120 dst_stride_y,
1121 dst_step_y,
1122 dst_stride_z,
1123 dst_step_z,
1124 dst_offset_first_element_in_bytes
1125#if defined(HAS_BIAS)
1126 ,
1127 bias_ptr,
1128 bias_stride_x,
1129 bias_step_x,
1130 bias_offset_first_element_in_bytes
1131#endif // defined(HAS_BIAS)
1132 );
1133}
1134
1135/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1136 *
1137 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1138 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1139 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1140 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1141 *
1142 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1143 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1144 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1145 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1146 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1147 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1148 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1149 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1150 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1151 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1152 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1153 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1154 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1155 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1156 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1157 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1158 */
1159__kernel void winograd_output_transform_4x1_5x1_nchw(
1160 TENSOR3D_DECLARATION(src),
1161 TENSOR3D_DECLARATION(dst)
1162#if defined(HAS_BIAS)
1163 ,
1164 VECTOR_DECLARATION(bias)
1165#endif // defined(HAS_BIAS)
1166)
1167{
1168 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1169 src_stride_x,
1170 src_step_x,
1171 src_stride_y,
1172 src_step_y,
1173 src_stride_z,
1174 src_step_z,
1175 src_offset_first_element_in_bytes,
1176 dst_ptr,
1177 dst_stride_x,
1178 dst_step_x,
1179 dst_stride_y,
1180 dst_step_y,
1181 dst_stride_z,
1182 dst_step_z,
1183 dst_offset_first_element_in_bytes
1184#if defined(HAS_BIAS)
1185 ,
1186 bias_ptr,
1187 bias_stride_x,
1188 bias_step_x,
1189 bias_offset_first_element_in_bytes
1190#endif // defined(HAS_BIAS)
1191 );
1192}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001193
1194/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1195 *
1196 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1197 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1198 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1199 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1200 *
1201 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1202 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1203 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1204 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1205 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1206 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1207 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1208 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1209 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1210 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1211 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1212 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1213 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1214 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1215 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1216 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1217 */
1218__kernel void winograd_output_transform_4x1_3x1_nhwc(
1219 TENSOR3D_DECLARATION(src),
1220 TENSOR3D_DECLARATION(dst),
1221#if defined(HAS_BIAS)
1222 VECTOR_DECLARATION(bias),
1223#endif // defined(HAS_BIAS)
1224 int dst_size)
1225{
1226 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1227 src_stride_x,
1228 src_step_x,
1229 src_stride_y,
1230 src_step_y,
1231 src_stride_z,
1232 src_step_z,
1233 src_offset_first_element_in_bytes,
1234 dst_ptr,
1235 dst_stride_x,
1236 dst_step_x,
1237 dst_stride_y,
1238 dst_step_y,
1239 dst_stride_z,
1240 dst_step_z,
1241 dst_offset_first_element_in_bytes,
1242#if defined(HAS_BIAS)
1243 bias_ptr,
1244 bias_stride_x,
1245 bias_step_x,
1246 bias_offset_first_element_in_bytes,
1247#endif // defined(HAS_BIAS)
1248 dst_size);
1249}
1250
1251/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1252 *
1253 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1254 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1255 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1256 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1257 *
1258 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1259 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1260 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1261 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1262 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1263 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1264 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1265 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1266 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1267 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1268 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1269 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1270 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1271 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1272 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1273 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1274 */
1275__kernel void winograd_output_transform_4x1_5x1_nhwc(
1276 TENSOR3D_DECLARATION(src),
1277 TENSOR3D_DECLARATION(dst),
1278#if defined(HAS_BIAS)
1279 VECTOR_DECLARATION(bias),
1280#endif // defined(HAS_BIAS)
1281 int dst_size)
1282{
1283 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1284 src_stride_x,
1285 src_step_x,
1286 src_stride_y,
1287 src_step_y,
1288 src_stride_z,
1289 src_step_z,
1290 src_offset_first_element_in_bytes,
1291 dst_ptr,
1292 dst_stride_x,
1293 dst_step_x,
1294 dst_stride_y,
1295 dst_step_y,
1296 dst_stride_z,
1297 dst_step_z,
1298 dst_offset_first_element_in_bytes,
1299#if defined(HAS_BIAS)
1300 bias_ptr,
1301 bias_stride_x,
1302 bias_step_x,
1303 bias_offset_first_element_in_bytes,
1304#endif // defined(HAS_BIAS)
1305 dst_size);
1306}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001307#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1308
1309#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1310/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1311 *
1312 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1313 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1314 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1315 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1316 *
1317 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1318 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1319 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1320 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1321 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1322 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1323 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1324 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1325 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1326 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1327 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1328 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1329 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1330 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1331 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1332 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1333 */
1334__kernel void winograd_output_transform_1x2_1x3_nchw(
1335 TENSOR3D_DECLARATION(src),
1336 TENSOR3D_DECLARATION(dst)
1337#if defined(HAS_BIAS)
1338 ,
1339 VECTOR_DECLARATION(bias)
1340#endif // defined(HAS_BIAS)
1341)
1342{
1343 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1344 src_stride_x,
1345 src_step_x,
1346 src_stride_y,
1347 src_step_y,
1348 src_stride_z,
1349 src_step_z,
1350 src_offset_first_element_in_bytes,
1351 dst_ptr,
1352 dst_stride_x,
1353 dst_step_x,
1354 dst_stride_y,
1355 dst_step_y,
1356 dst_stride_z,
1357 dst_step_z,
1358 dst_offset_first_element_in_bytes
1359#if defined(HAS_BIAS)
1360 ,
1361 bias_ptr,
1362 bias_stride_x,
1363 bias_step_x,
1364 bias_offset_first_element_in_bytes
1365#endif // defined(HAS_BIAS)
1366 );
1367}
1368
1369/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1370 *
1371 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1372 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1373 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1374 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1375 *
1376 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1377 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1378 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1379 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1380 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1381 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1382 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1383 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1384 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1385 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1386 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1387 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1388 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1389 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1390 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1391 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1392 */
1393__kernel void winograd_output_transform_1x4_1x3_nchw(
1394 TENSOR3D_DECLARATION(src),
1395 TENSOR3D_DECLARATION(dst)
1396#if defined(HAS_BIAS)
1397 ,
1398 VECTOR_DECLARATION(bias)
1399#endif // defined(HAS_BIAS)
1400)
1401{
1402 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1403 src_stride_x,
1404 src_step_x,
1405 src_stride_y,
1406 src_step_y,
1407 src_stride_z,
1408 src_step_z,
1409 src_offset_first_element_in_bytes,
1410 dst_ptr,
1411 dst_stride_x,
1412 dst_step_x,
1413 dst_stride_y,
1414 dst_step_y,
1415 dst_stride_z,
1416 dst_step_z,
1417 dst_offset_first_element_in_bytes
1418#if defined(HAS_BIAS)
1419 ,
1420 bias_ptr,
1421 bias_stride_x,
1422 bias_step_x,
1423 bias_offset_first_element_in_bytes
1424#endif // defined(HAS_BIAS)
1425 );
1426}
1427
1428/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
1429 *
1430 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1431 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1432 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1433 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1434 *
1435 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1436 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1437 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1438 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1439 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1440 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1441 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1442 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1443 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1444 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1445 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1446 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1447 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1448 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1449 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1450 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1451 */
1452__kernel void winograd_output_transform_1x4_1x5_nchw(
1453 TENSOR3D_DECLARATION(src),
1454 TENSOR3D_DECLARATION(dst)
1455#if defined(HAS_BIAS)
1456 ,
1457 VECTOR_DECLARATION(bias)
1458#endif // defined(HAS_BIAS)
1459)
1460{
1461 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1462 src_stride_x,
1463 src_step_x,
1464 src_stride_y,
1465 src_step_y,
1466 src_stride_z,
1467 src_step_z,
1468 src_offset_first_element_in_bytes,
1469 dst_ptr,
1470 dst_stride_x,
1471 dst_step_x,
1472 dst_stride_y,
1473 dst_step_y,
1474 dst_stride_z,
1475 dst_step_z,
1476 dst_offset_first_element_in_bytes
1477#if defined(HAS_BIAS)
1478 ,
1479 bias_ptr,
1480 bias_stride_x,
1481 bias_step_x,
1482 bias_offset_first_element_in_bytes
1483#endif // defined(HAS_BIAS)
1484 );
1485}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001486
1487/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
1488 *
1489 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1490 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1491 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1492 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1493 *
1494 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1495 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1496 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1497 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1498 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1499 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1500 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1501 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1502 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1503 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1504 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1505 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1506 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1507 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1508 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1509 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1510 */
1511__kernel void winograd_output_transform_1x4_1x3_nhwc(
1512 TENSOR3D_DECLARATION(src),
1513 TENSOR3D_DECLARATION(dst),
1514#if defined(HAS_BIAS)
1515 VECTOR_DECLARATION(bias),
1516#endif // defined(HAS_BIAS)
1517 int dst_size)
1518{
1519 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1520 src_stride_x,
1521 src_step_x,
1522 src_stride_y,
1523 src_step_y,
1524 src_stride_z,
1525 src_step_z,
1526 src_offset_first_element_in_bytes,
1527 dst_ptr,
1528 dst_stride_x,
1529 dst_step_x,
1530 dst_stride_y,
1531 dst_step_y,
1532 dst_stride_z,
1533 dst_step_z,
1534 dst_offset_first_element_in_bytes,
1535#if defined(HAS_BIAS)
1536 bias_ptr,
1537 bias_stride_x,
1538 bias_step_x,
1539 bias_offset_first_element_in_bytes,
1540#endif // defined(HAS_BIAS)
1541 dst_size);
1542}
1543
1544/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
1545 *
1546 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1547 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1548 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1549 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1550 *
1551 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1552 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1553 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1554 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1555 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1556 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1557 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1558 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1559 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1560 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1561 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1562 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1563 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1564 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1565 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1566 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1567 */
1568__kernel void winograd_output_transform_1x4_1x5_nhwc(
1569 TENSOR3D_DECLARATION(src),
1570 TENSOR3D_DECLARATION(dst),
1571#if defined(HAS_BIAS)
1572 VECTOR_DECLARATION(bias),
1573#endif // defined(HAS_BIAS)
1574 int dst_size)
1575{
1576 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1577 src_stride_x,
1578 src_step_x,
1579 src_stride_y,
1580 src_step_y,
1581 src_stride_z,
1582 src_step_z,
1583 src_offset_first_element_in_bytes,
1584 dst_ptr,
1585 dst_stride_x,
1586 dst_step_x,
1587 dst_stride_y,
1588 dst_step_y,
1589 dst_stride_z,
1590 dst_step_z,
1591 dst_offset_first_element_in_bytes,
1592#if defined(HAS_BIAS)
1593 bias_ptr,
1594 bias_stride_x,
1595 bias_step_x,
1596 bias_offset_first_element_in_bytes,
1597#endif // defined(HAS_BIAS)
1598 dst_size);
1599}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001600#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001601#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)