blob: 61f0f61db772b3730d8e865f961fcd39b1c5ae33 [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
27/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
28 *
29 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
30 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
31 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
32 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
33 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
34 *
35 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
36 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
37 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
38 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
39 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
40 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
41 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
42 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
43 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
44 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
45 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
46 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
47 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
48 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
49 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
50 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
51 */
52__kernel void winograd_output_transform_2x2_3x3_nchw(
53 TENSOR3D_DECLARATION(src),
54 TENSOR3D_DECLARATION(dst)
55#if defined(HAS_BIAS)
56 ,
57 VECTOR_DECLARATION(bias)
58#endif // defined(HAS_BIAS)
59)
60{
61 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
62 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
63
64 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
65
66 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
67 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
68 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
69 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
70 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
71
72#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
73 // Compute the 2x1 or 1x2 output tile
74 // out00 = d00 + d01 + d02
75 // out01 = d01 - d02 - d03
76
77 float out00 = d00 + d01 + d02;
78 float out01 = d01 - d02 - d03;
79#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
80 float d10 = *((__global float *)(src_addr + 4 * src_stride_z));
81 float d11 = *((__global float *)(src_addr + 5 * src_stride_z));
82 float d12 = *((__global float *)(src_addr + 6 * src_stride_z));
83 float d13 = *((__global float *)(src_addr + 7 * src_stride_z));
84
85 float d20 = *((__global float *)(src_addr + 8 * src_stride_z));
86 float d21 = *((__global float *)(src_addr + 9 * src_stride_z));
87 float d22 = *((__global float *)(src_addr + 10 * src_stride_z));
88 float d23 = *((__global float *)(src_addr + 11 * src_stride_z));
89
90 float d30 = *((__global float *)(src_addr + 12 * src_stride_z));
91 float d31 = *((__global float *)(src_addr + 13 * src_stride_z));
92 float d32 = *((__global float *)(src_addr + 14 * src_stride_z));
93 float d33 = *((__global float *)(src_addr + 15 * src_stride_z));
94
95 // Compute the 2x2 output tile
96 float k0 = d01 + d11 + d21;
97 float k1 = d02 + d12 + d22;
98 float k2 = d11 - d21 - d31;
99 float k3 = d12 - d22 - d32;
100
101 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
102 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
103 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
104 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
105
106 float out00 = d10;
107 float out01 = -d13;
108 float out10 = d10;
109 float out11 = -d13;
110
111 out00 += d00 + d20 + k0 + k1;
112 out01 += k0 - k1 - (d03 + d23);
113 out10 += -d20 - d30 + k2 + k3;
114 out11 += k2 - k3 + d23 + d33;
115#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
116
117 int y_in = get_global_id(1);
118 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
119 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
120 int z_out = get_global_id(0);
121
122#if defined(HAS_BIAS)
123 // Add bias
124 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
125
126 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
127
128 out00 += (float)b;
129 out01 += (float)b;
130#endif // defined(HAS_BIAS)
131
132 // Get output address
133 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
134
135 // Store the output tile
136#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
137 *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
138 *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
139#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
140 vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
141#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
142
143#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
144#if defined(HAS_BIAS)
145 // Add bias
146 out10 += (float)b;
147 out11 += (float)b;
148#endif // defined(HAS_BIAS)
149
150 vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
151#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
152}
153
154/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
155 *
156 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
157 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
158 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
159 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
160 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
161 *
162 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
163 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
164 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
165 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
166 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
167 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
168 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
169 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
170 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
171 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
172 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
173 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
174 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
175 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
176 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
177 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
178 */
179__kernel void winograd_output_transform_4x4_3x3_nchw(
180 TENSOR3D_DECLARATION(src),
181 TENSOR3D_DECLARATION(dst)
182#if defined(HAS_BIAS)
183 ,
184 VECTOR_DECLARATION(bias)
185#endif // defined(HAS_BIAS)
186)
187{
188 // Each thread stores a 4x4/4x1 or 1x4 tile
189 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
190
191 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
192
193 // Load the values across the channels to compose the 6x6 or 6x1 tile
194 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
195 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
196 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
197 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
198 float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
199 float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
200
201#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
202 // Compute out00, out01, out02 and out03
203 float out00 = d00 + d01 + d02 + d03 + d04;
204 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
205 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
206 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
207#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
208 float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
209 float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
210 float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
211 float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
212 float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
213 float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
214
215 float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
216 float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
217 float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
218 float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
219 float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
220 float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
221
222 float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
223 float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
224 float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
225 float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
226 float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
227 float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
228
229 float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
230 float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
231 float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
232 float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
233 float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
234 float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
235
236 float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
237 float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
238 float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
239 float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
240 float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
241 float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
242
243 // Compute out00, out01, out02 and out03
244 float out00 = d01 + d21 + d41 + d11 + d31;
245 float out01 = d01 + d21 + d41 + d11 + d31;
246 float out02 = d01 + d21 + d41 + d11 + d31;
247 float out03 = d01 + d21 + d41 + d11 + d31;
248
249 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
250 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
251
252 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
253 out01 += k1 - d02 - d12 - d22 - d32 - d42;
254 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
255 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
256
257 // Compute out10, out11, out12 and out13
258 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
259 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
260 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
261 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
262
263 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
264 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
265
266 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
267 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
268 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
269 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
270
271 // Compute out20, out21, out22 and out23
272 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
273 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
274 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
275 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
276
277 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
278 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
279
280 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
281 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
282 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
283 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
284
285 // Compute out30, out31, out32 and out33
286 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
287 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
288 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
289 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
290
291 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
292 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
293
294 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
295 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
296 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
297 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
298#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
299
300 int y_in = get_global_id(1);
301 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
302 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
303 int z_out = get_global_id(0);
304
305#if defined(HAS_BIAS)
306 // Add bias
307 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
308
309 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
310
311 out00 += (float)b;
312 out01 += (float)b;
313 out02 += (float)b;
314 out03 += (float)b;
315#endif // defined(HAS_BIAS)
316
317 // Get output address
318 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
319
320 // Store the output tile
321#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
322 *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
323 *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
324 *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;
325 *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;
326#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
327 vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
328#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
329
330#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
331#if defined(HAS_BIAS)
332 // Add bias
333 out10 += (float)b;
334 out11 += (float)b;
335 out12 += (float)b;
336 out13 += (float)b;
337
338 out20 += (float)b;
339 out21 += (float)b;
340 out22 += (float)b;
341 out23 += (float)b;
342
343 out30 += (float)b;
344 out31 += (float)b;
345 out32 += (float)b;
346 out33 += (float)b;
347#endif // defined(HAS_BIAS)
348 vstore4((float4)(out10, out11, out12, out13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
349 vstore4((float4)(out20, out21, out22, out23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
350 vstore4((float4)(out30, out31, out32, out33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
351#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
352}
353
Giorgio Arena149fdf32018-07-04 17:03:33 +0100354/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100355 *
356 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100357 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
358 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
359 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
360 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100361 *
362 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
363 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
364 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
365 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
366 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
367 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
368 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
369 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
370 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
371 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
372 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
373 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
374 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
375 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
376 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
377 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
378 * @param[in] dst_size Size of the destination tensor, minus the last padding
379 */
380__kernel void winograd_output_transform_4x4_3x3_nhwc(
381 TENSOR3D_DECLARATION(src),
382 TENSOR3D_DECLARATION(dst),
383#if defined(HAS_BIAS)
384 VECTOR_DECLARATION(bias),
385#endif // defined(HAS_BIAS)
386 int dst_size)
387{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100388 // Each thread stores a 4x4/4x1 or 1x4 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100389 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
390
391 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
392
Giorgio Arena149fdf32018-07-04 17:03:33 +0100393 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100394 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
395 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
396 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
397 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
398 float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
399 float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
400
Giorgio Arena149fdf32018-07-04 17:03:33 +0100401#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
402 // Compute out00, out01, out02 and out03
403 float out00 = d00 + d01 + d02 + d03 + d04;
404 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
405 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
406 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
407#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
408
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100409 float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
410 float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
411 float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
412 float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
413 float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
414 float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
415
416 float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
417 float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
418 float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
419 float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
420 float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
421 float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
422
423 float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
424 float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
425 float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
426 float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
427 float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
428 float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
429
430 float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
431 float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
432 float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
433 float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
434 float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
435 float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
436
437 float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
438 float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
439 float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
440 float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
441 float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
442 float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
443
444 // Compute out00, out01, out02 and out03
445 float out00 = d01 + d21 + d41 + d11 + d31;
446 float out01 = d01 + d21 + d41 + d11 + d31;
447 float out02 = d01 + d21 + d41 + d11 + d31;
448 float out03 = d01 + d21 + d41 + d11 + d31;
449
450 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
451 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
452
453 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
454 out01 += k1 - d02 - d12 - d22 - d32 - d42;
455 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
456 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
457
458 // Compute out10, out11, out12 and out13
459 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
460 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
461 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
462 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
463
464 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
465 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
466
467 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
468 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
469 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
470 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
471
472 // Compute out20, out21, out22 and out23
473 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
474 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
475 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
476 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
477
478 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
479 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
480
481 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
482 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
483 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
484 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
485
486 // Compute out30, out31, out32 and out33
487 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
488 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
489 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
490 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
491
492 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
493 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
494
495 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
496 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
497 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
498 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100499#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100500
501 int y_in = get_global_id(1);
502 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100503 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
504 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100505
506#if defined(HAS_BIAS)
507 // Add bias
508 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
509
510 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
511
512 out00 += (float)b;
513 out01 += (float)b;
514 out02 += (float)b;
515 out03 += (float)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100516#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100517 out10 += (float)b;
518 out11 += (float)b;
519 out12 += (float)b;
520 out13 += (float)b;
521
522 out20 += (float)b;
523 out21 += (float)b;
524 out22 += (float)b;
525 out23 += (float)b;
526
527 out30 += (float)b;
528 out31 += (float)b;
529 out32 += (float)b;
530 out33 += (float)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100531#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100532
533#endif // defined(HAS_BIAS)
534
Giorgio Arena149fdf32018-07-04 17:03:33 +0100535#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
536 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
537 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
538
539 // Store the 1x4 output tile
540 *((__global float *)(dst_ptr + offset.s0)) = out00;
541 *((__global float *)(dst_ptr + offset.s1)) = out01;
542 *((__global float *)(dst_ptr + offset.s2)) = out02;
543 *((__global float *)(dst_ptr + offset.s3)) = out03;
544#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
545 // Store the 4x1 output tile
546 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
547 int4 mult_y = min(dst_size - offset, 1);
548
549 *((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset)) = out00;
550 *((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset)) = out01;
551 *((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset)) = out02;
552 *((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset)) = out03;
553#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100554 // Get output address
555 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
556 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
557 int4 mult_y = min(dst_size - offset, 1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
558
559 // Store the 4x4 output tile
560 *((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;
561 *((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;
562 *((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;
563 *((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;
564 *((__global float *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;
565 *((__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;
566 *((__global float *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;
567 *((__global float *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;
568 *((__global float *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;
569 *((__global float *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;
570 *((__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;
571 *((__global float *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;
572 *((__global float *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;
573 *((__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;
574 *((__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;
575 *((__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100576
577#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100578}
579
Giorgio Arena149fdf32018-07-04 17:03:33 +0100580#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
581/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
582 *
583 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
584 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
585 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
586 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
587 *
588 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
589 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
590 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
591 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
592 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
593 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
594 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
595 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
596 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
597 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
598 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
599 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
600 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
601 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
602 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
603 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
604 */
605__kernel void winograd_output_transform_4x1_3x1_nhwc(
606 TENSOR3D_DECLARATION(src),
607 TENSOR3D_DECLARATION(dst),
608#if defined(HAS_BIAS)
609 VECTOR_DECLARATION(bias),
610#endif // defined(HAS_BIAS)
611 int dst_size)
612{
613 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
614 src_stride_x,
615 src_step_x,
616 src_stride_y,
617 src_step_y,
618 src_stride_z,
619 src_step_z,
620 src_offset_first_element_in_bytes,
621 dst_ptr,
622 dst_stride_x,
623 dst_step_x,
624 dst_stride_y,
625 dst_step_y,
626 dst_stride_z,
627 dst_step_z,
628 dst_offset_first_element_in_bytes,
629#if defined(HAS_BIAS)
630 bias_ptr,
631 bias_stride_x,
632 bias_step_x,
633 bias_offset_first_element_in_bytes,
634#endif // defined(HAS_BIAS)
635 dst_size);
636}
637#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
638
639#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
640/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
641 *
642 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
643 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
644 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
645 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
646 *
647 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
648 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
649 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
650 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
651 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
652 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
653 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
654 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
655 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
656 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
657 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
658 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
659 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
660 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
661 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
662 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
663 */
664__kernel void winograd_output_transform_1x4_1x3_nhwc(
665 TENSOR3D_DECLARATION(src),
666 TENSOR3D_DECLARATION(dst),
667#if defined(HAS_BIAS)
668 VECTOR_DECLARATION(bias),
669#endif // defined(HAS_BIAS)
670 int dst_size)
671{
672 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
673 src_stride_x,
674 src_step_x,
675 src_stride_y,
676 src_step_y,
677 src_stride_z,
678 src_step_z,
679 src_offset_first_element_in_bytes,
680 dst_ptr,
681 dst_stride_x,
682 dst_step_x,
683 dst_stride_y,
684 dst_step_y,
685 dst_stride_z,
686 dst_step_z,
687 dst_offset_first_element_in_bytes,
688#if defined(HAS_BIAS)
689 bias_ptr,
690 bias_stride_x,
691 bias_step_x,
692 bias_offset_first_element_in_bytes,
693#endif // defined(HAS_BIAS)
694 dst_size);
695}
696#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
697
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100698#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
699 ({ \
700 comm_fact.s0 = d1 + d2; \
701 comm_fact.s1 = d3 + d4; \
702 comm_fact.s2 = d5 + d6; \
703 \
704 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
705 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
706 \
707 comm_fact.s0 = d1 - d2; \
708 comm_fact.s1 = d3 - d4; \
709 comm_fact.s2 = d5 - d6; \
710 \
711 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
712 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
713 })
714
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100715/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100716 *
717 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100718 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
719 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
720 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
721 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100722 *
723 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
724 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
725 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
726 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
727 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
728 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
729 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
730 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
731 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
732 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
733 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
734 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
735 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
736 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
737 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
738 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
739 */
740__kernel void winograd_output_transform_4x4_5x5_nchw(
741 TENSOR3D_DECLARATION(src),
742 TENSOR3D_DECLARATION(dst)
743#if defined(HAS_BIAS)
744 ,
745 VECTOR_DECLARATION(bias)
746#endif // defined(HAS_BIAS)
747)
748{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100749 // Each thread stores a 4x4/4x1 or 1x4 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100750 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
751
752 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
753
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100754 // Compute output address
755 int y_in = get_global_id(1);
756 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
757 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
758 int z_out = get_global_id(0);
759
760 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
761
762 // Load the values across the channels to compose the input tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100763 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
764 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
765 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
766 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
767 float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
768 float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
769 float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
770 float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
771
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100772#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
773 // Compute out00, out01, out02 and out03
774 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
775 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
776 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
777 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
778
779#if defined(HAS_BIAS)
780 // Add bias
781 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
782
783 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
784
785 out00 += (float)b;
786 out01 += (float)b;
787 out02 += (float)b;
788 out03 += (float)b;
789#endif // defined(HAS_BIAS)
790
791 // Store the output tile
792#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
793 *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
794 *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
795 *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;
796 *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;
797#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
798 vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr));
799#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
800
801#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100802 float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
803 float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
804 float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
805 float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
806 float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
807 float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
808 float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
809 float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100810
811 float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
812 float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
813 float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
814 float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
815 float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
816 float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
817 float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
818 float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
819
820 float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
821 float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
822 float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
823 float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
824 float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
825 float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
826 float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
827 float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
828
829 float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
830 float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
831 float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
832 float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
833 float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
834 float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
835 float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
836 float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
837
838 float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
839 float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
840 float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
841 float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
842 float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
843 float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
844 float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
845 float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
846
847 float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
848 float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
849 float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
850 float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
851 float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
852 float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
853 float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
854 float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
855
856 float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
857 float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
858 float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
859 float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
860 float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
861 float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
862 float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
863 float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
864
865 // Compute the 8x4 intermediate tensor
866 float4 comm_fact0, comm_fact1, comm_fact2;
867 float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
868
869 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
870 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
871 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
872 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
873 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
874 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
875 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
876 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
877
878 // Compute the 4x4 output tile
879 comm_fact0 = tmp_col1 + tmp_col2;
880 comm_fact1 = tmp_col3 + tmp_col4;
881 comm_fact2 = tmp_col5 + tmp_col6;
882
883 float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
884 float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
885
886 comm_fact0 = tmp_col1 - tmp_col2;
887 comm_fact1 = tmp_col3 - tmp_col4;
888 comm_fact2 = tmp_col5 - tmp_col6;
889
890 float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
891 float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
892
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100893#if defined(HAS_BIAS)
894 // Add bias
895 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
896
897 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
898
899 out_col0 += (float4)b;
900 out_col1 += (float4)b;
901 out_col2 += (float4)b;
902 out_col3 += (float4)b;
903#endif // defined(HAS_BIAS)
904
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100905 // Store the output tile
906 vstore4((float4)(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
907 vstore4((float4)(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
908 vstore4((float4)(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
909 vstore4((float4)(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
910#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100911}
912
913/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 5x5 and the data layout is NHWC
914 *
915 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
916 *
917 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
918 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
919 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
920 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
921 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
922 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
923 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
924 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
925 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
926 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
927 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
928 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
929 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
930 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
931 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
932 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
933 */
934__kernel void winograd_output_transform_4x4_5x5_nhwc(
935 TENSOR3D_DECLARATION(src),
936 TENSOR3D_DECLARATION(dst),
937#if defined(HAS_BIAS)
938 VECTOR_DECLARATION(bias),
939#endif // defined(HAS_BIAS)
940 int dst_size)
941{
942 // Each thread stores a 4x4 tile
943 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
944
945 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
946
947 // Load the values across the 64 channels to compose the 8x8 input tile
948 float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
949 float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
950 float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
951 float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
952 float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
953 float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
954 float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
955 float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
956
957 float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
958 float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
959 float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
960 float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
961 float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
962 float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
963 float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
964 float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
965
966 float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
967 float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
968 float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
969 float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
970 float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
971 float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
972 float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
973 float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
974
975 float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
976 float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
977 float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
978 float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
979 float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
980 float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
981 float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
982 float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
983
984 float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
985 float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
986 float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
987 float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
988 float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
989 float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
990 float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
991 float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
992
993 float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
994 float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
995 float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
996 float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
997 float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
998 float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
999 float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
1000 float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
1001
1002 float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
1003 float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
1004 float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
1005 float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
1006 float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
1007 float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
1008 float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
1009 float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
1010
1011 float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
1012 float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
1013 float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
1014 float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
1015 float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
1016 float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
1017 float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
1018 float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
1019
1020 // Compute the 8x4 intermediate tensor
1021 float4 comm_fact0, comm_fact1, comm_fact2;
1022 float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
1023
1024 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1025 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1026 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1027 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1028 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1029 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1030 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1031 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1032
1033 // Compute the 4x4 output tile
1034 comm_fact0 = tmp_col1 + tmp_col2;
1035 comm_fact1 = tmp_col3 + tmp_col4;
1036 comm_fact2 = tmp_col5 + tmp_col6;
1037
1038 float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
1039 float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
1040
1041 comm_fact0 = tmp_col1 - tmp_col2;
1042 comm_fact1 = tmp_col3 - tmp_col4;
1043 comm_fact2 = tmp_col5 - tmp_col6;
1044
1045 float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
1046 float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
1047
1048 int y_in = get_global_id(1);
1049 int x_out = get_global_id(0);
1050 int y_out = (y_in % NUM_TILES_X) * 4;
1051 int z_out = (y_in / NUM_TILES_X) * 4;
1052
1053#if defined(HAS_BIAS)
1054 // Add bias
1055 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1056
1057 float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
1058
1059 out_col0 += (float4)b;
1060 out_col1 += (float4)b;
1061 out_col2 += (float4)b;
1062 out_col3 += (float4)b;
1063#endif // defined(HAS_BIAS)
1064
1065 // Get output address
1066 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
1067 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
1068 int4 mult_y = min(dst_size - offset, 1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
1069
1070 // Store the 4x4 output tile
1071 *(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0) = out_col0.s0;
1072 *(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0) = out_col1.s0;
1073 *(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0) = out_col2.s0;
1074 *(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0) = out_col3.s0;
1075 *(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s1) = out_col0.s1;
1076 *(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s1) = out_col1.s1;
1077 *(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s1) = out_col2.s1;
1078 *(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s1) = out_col3.s1;
1079 *(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s2) = out_col0.s2;
1080 *(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s2) = out_col1.s2;
1081 *(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s2) = out_col2.s2;
1082 *(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s2) = out_col3.s2;
1083 *(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s3) = out_col0.s3;
1084 *(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s3) = out_col1.s3;
1085 *(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s3) = out_col2.s3;
1086 *(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s3) = out_col3.s3;
1087}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001088
1089#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1090/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1091 *
1092 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1093 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1094 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1095 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1096 *
1097 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1098 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1099 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1100 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1101 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1102 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1103 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1104 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1105 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1106 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1107 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1108 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1109 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1110 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1111 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1112 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1113 */
1114__kernel void winograd_output_transform_2x1_3x1_nchw(
1115 TENSOR3D_DECLARATION(src),
1116 TENSOR3D_DECLARATION(dst)
1117#if defined(HAS_BIAS)
1118 ,
1119 VECTOR_DECLARATION(bias)
1120#endif // defined(HAS_BIAS)
1121)
1122{
1123 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1124 src_stride_x,
1125 src_step_x,
1126 src_stride_y,
1127 src_step_y,
1128 src_stride_z,
1129 src_step_z,
1130 src_offset_first_element_in_bytes,
1131 dst_ptr,
1132 dst_stride_x,
1133 dst_step_x,
1134 dst_stride_y,
1135 dst_step_y,
1136 dst_stride_z,
1137 dst_step_z,
1138 dst_offset_first_element_in_bytes
1139#if defined(HAS_BIAS)
1140 ,
1141 bias_ptr,
1142 bias_stride_x,
1143 bias_step_x,
1144 bias_offset_first_element_in_bytes
1145#endif // defined(HAS_BIAS)
1146 );
1147}
1148
1149/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1150 *
1151 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1152 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1153 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1154 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1155 *
1156 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1157 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1158 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1159 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1160 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1161 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1162 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1163 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1164 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1165 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1166 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1167 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1168 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1169 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1170 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1171 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1172 */
1173__kernel void winograd_output_transform_4x1_3x1_nchw(
1174 TENSOR3D_DECLARATION(src),
1175 TENSOR3D_DECLARATION(dst)
1176#if defined(HAS_BIAS)
1177 ,
1178 VECTOR_DECLARATION(bias)
1179#endif // defined(HAS_BIAS)
1180)
1181{
1182 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1183 src_stride_x,
1184 src_step_x,
1185 src_stride_y,
1186 src_step_y,
1187 src_stride_z,
1188 src_step_z,
1189 src_offset_first_element_in_bytes,
1190 dst_ptr,
1191 dst_stride_x,
1192 dst_step_x,
1193 dst_stride_y,
1194 dst_step_y,
1195 dst_stride_z,
1196 dst_step_z,
1197 dst_offset_first_element_in_bytes
1198#if defined(HAS_BIAS)
1199 ,
1200 bias_ptr,
1201 bias_stride_x,
1202 bias_step_x,
1203 bias_offset_first_element_in_bytes
1204#endif // defined(HAS_BIAS)
1205 );
1206}
1207
1208/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1209 *
1210 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1211 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1212 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1213 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1214 *
1215 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1216 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1217 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1218 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1219 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1220 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1221 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1222 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1223 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1224 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1225 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1226 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1227 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1228 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1229 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1230 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1231 */
1232__kernel void winograd_output_transform_4x1_5x1_nchw(
1233 TENSOR3D_DECLARATION(src),
1234 TENSOR3D_DECLARATION(dst)
1235#if defined(HAS_BIAS)
1236 ,
1237 VECTOR_DECLARATION(bias)
1238#endif // defined(HAS_BIAS)
1239)
1240{
1241 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1242 src_stride_x,
1243 src_step_x,
1244 src_stride_y,
1245 src_step_y,
1246 src_stride_z,
1247 src_step_z,
1248 src_offset_first_element_in_bytes,
1249 dst_ptr,
1250 dst_stride_x,
1251 dst_step_x,
1252 dst_stride_y,
1253 dst_step_y,
1254 dst_stride_z,
1255 dst_step_z,
1256 dst_offset_first_element_in_bytes
1257#if defined(HAS_BIAS)
1258 ,
1259 bias_ptr,
1260 bias_stride_x,
1261 bias_step_x,
1262 bias_offset_first_element_in_bytes
1263#endif // defined(HAS_BIAS)
1264 );
1265}
1266#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1267
1268#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1269/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1270 *
1271 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1272 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1273 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1274 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1275 *
1276 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1277 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1278 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1279 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1280 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1281 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1282 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1283 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1284 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1285 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1286 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1287 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1288 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1289 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1290 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1291 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1292 */
1293__kernel void winograd_output_transform_1x2_1x3_nchw(
1294 TENSOR3D_DECLARATION(src),
1295 TENSOR3D_DECLARATION(dst)
1296#if defined(HAS_BIAS)
1297 ,
1298 VECTOR_DECLARATION(bias)
1299#endif // defined(HAS_BIAS)
1300)
1301{
1302 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1303 src_stride_x,
1304 src_step_x,
1305 src_stride_y,
1306 src_step_y,
1307 src_stride_z,
1308 src_step_z,
1309 src_offset_first_element_in_bytes,
1310 dst_ptr,
1311 dst_stride_x,
1312 dst_step_x,
1313 dst_stride_y,
1314 dst_step_y,
1315 dst_stride_z,
1316 dst_step_z,
1317 dst_offset_first_element_in_bytes
1318#if defined(HAS_BIAS)
1319 ,
1320 bias_ptr,
1321 bias_stride_x,
1322 bias_step_x,
1323 bias_offset_first_element_in_bytes
1324#endif // defined(HAS_BIAS)
1325 );
1326}
1327
1328/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1329 *
1330 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1331 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1332 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1333 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1334 *
1335 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1336 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1337 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1338 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1339 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1340 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1341 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1342 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1343 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1344 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1345 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1346 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1347 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1348 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1349 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1350 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1351 */
1352__kernel void winograd_output_transform_1x4_1x3_nchw(
1353 TENSOR3D_DECLARATION(src),
1354 TENSOR3D_DECLARATION(dst)
1355#if defined(HAS_BIAS)
1356 ,
1357 VECTOR_DECLARATION(bias)
1358#endif // defined(HAS_BIAS)
1359)
1360{
1361 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1362 src_stride_x,
1363 src_step_x,
1364 src_stride_y,
1365 src_step_y,
1366 src_stride_z,
1367 src_step_z,
1368 src_offset_first_element_in_bytes,
1369 dst_ptr,
1370 dst_stride_x,
1371 dst_step_x,
1372 dst_stride_y,
1373 dst_step_y,
1374 dst_stride_z,
1375 dst_step_z,
1376 dst_offset_first_element_in_bytes
1377#if defined(HAS_BIAS)
1378 ,
1379 bias_ptr,
1380 bias_stride_x,
1381 bias_step_x,
1382 bias_offset_first_element_in_bytes
1383#endif // defined(HAS_BIAS)
1384 );
1385}
1386
1387/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
1388 *
1389 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1390 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1391 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1392 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1393 *
1394 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
1395 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1396 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1397 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1398 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1399 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1400 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1401 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1402 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1403 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1404 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1405 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1406 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1407 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1408 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1409 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1410 */
1411__kernel void winograd_output_transform_1x4_1x5_nchw(
1412 TENSOR3D_DECLARATION(src),
1413 TENSOR3D_DECLARATION(dst)
1414#if defined(HAS_BIAS)
1415 ,
1416 VECTOR_DECLARATION(bias)
1417#endif // defined(HAS_BIAS)
1418)
1419{
1420 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1421 src_stride_x,
1422 src_step_x,
1423 src_stride_y,
1424 src_step_y,
1425 src_stride_z,
1426 src_step_z,
1427 src_offset_first_element_in_bytes,
1428 dst_ptr,
1429 dst_stride_x,
1430 dst_step_x,
1431 dst_stride_y,
1432 dst_step_y,
1433 dst_stride_z,
1434 dst_step_z,
1435 dst_offset_first_element_in_bytes
1436#if defined(HAS_BIAS)
1437 ,
1438 bias_ptr,
1439 bias_stride_x,
1440 bias_step_x,
1441 bias_offset_first_element_in_bytes
1442#endif // defined(HAS_BIAS)
1443 );
1444}
1445#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001446#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)