blob: 9be51f27ecbd28fe05be1163810869ff68fdbac7 [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
27/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
28 *
29 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
30 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
31 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
32 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
33 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010034 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010035 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010036 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010037 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
38 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
39 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
40 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
41 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
42 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
43 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
44 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
45 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
46 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
47 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
48 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
49 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
50 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
51 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
52 */
53__kernel void winograd_output_transform_2x2_3x3_nchw(
54 TENSOR3D_DECLARATION(src),
55 TENSOR3D_DECLARATION(dst)
56#if defined(HAS_BIAS)
57 ,
58 VECTOR_DECLARATION(bias)
59#endif // defined(HAS_BIAS)
60)
61{
62 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
63 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
64
65 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
66
67 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010068 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
69 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
70 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
71 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010072
73#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
74 // Compute the 2x1 or 1x2 output tile
75 // out00 = d00 + d01 + d02
76 // out01 = d01 - d02 - d03
77
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010078 DATA_TYPE out00 = d00 + d01 + d02;
79 DATA_TYPE out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010080#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010081 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
82 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
83 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
84 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010085
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010086 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
87 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
88 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
89 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010090
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010091 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
92 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
93 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
94 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010095
96 // Compute the 2x2 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010097 DATA_TYPE k0 = d01 + d11 + d21;
98 DATA_TYPE k1 = d02 + d12 + d22;
99 DATA_TYPE k2 = d11 - d21 - d31;
100 DATA_TYPE k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100101
102 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
103 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
104 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
105 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
106
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100107 DATA_TYPE out00 = d10;
108 DATA_TYPE out01 = -d13;
109 DATA_TYPE out10 = d10;
110 DATA_TYPE out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100111
112 out00 += d00 + d20 + k0 + k1;
113 out01 += k0 - k1 - (d03 + d23);
114 out10 += -d20 - d30 + k2 + k3;
115 out11 += k2 - k3 + d23 + d33;
116#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
117
118 int y_in = get_global_id(1);
119 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
120 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
121 int z_out = get_global_id(0);
122
123#if defined(HAS_BIAS)
124 // Add bias
125 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
126
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100127 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100128
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100129 out00 += (DATA_TYPE)b;
130 out01 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100131#endif // defined(HAS_BIAS)
132
133 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100134 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100135
136 // Store the output tile
137#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100138 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
139 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100140#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100141 vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out00, out01), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100142#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
143
144#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
145#if defined(HAS_BIAS)
146 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100147 out10 += (DATA_TYPE)b;
148 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100149#endif // defined(HAS_BIAS)
150
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100151 vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out10, out11), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100152#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
153}
154
155/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
156 *
157 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
158 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
159 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
160 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
161 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100162 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100163 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100164 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100165 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
166 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
167 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
168 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
169 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
170 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
171 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
172 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
173 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
174 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
175 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
176 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
177 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
178 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
179 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
180 */
181__kernel void winograd_output_transform_4x4_3x3_nchw(
182 TENSOR3D_DECLARATION(src),
183 TENSOR3D_DECLARATION(dst)
184#if defined(HAS_BIAS)
185 ,
186 VECTOR_DECLARATION(bias)
187#endif // defined(HAS_BIAS)
188)
189{
190 // Each thread stores a 4x4/4x1 or 1x4 tile
191 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
192
193 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
194
195 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100196 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
197 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
198 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
199 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
200 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
201 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100202
203#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
204 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100205 DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04;
206 DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
207 DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
208 DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100209#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100210 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
211 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
212 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
213 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
214 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
215 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100216
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100217 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
218 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
219 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
220 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
221 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
222 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100223
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100224 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
225 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
226 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
227 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
228 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
229 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100230
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100231 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
232 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
233 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
234 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
235 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
236 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100237
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100238 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
239 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
240 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
241 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
242 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
243 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100244
245 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100246 DATA_TYPE out00 = d01 + d21 + d41 + d11 + d31;
247 DATA_TYPE out01 = d01 + d21 + d41 + d11 + d31;
248 DATA_TYPE out02 = d01 + d21 + d41 + d11 + d31;
249 DATA_TYPE out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100250
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100251 DATA_TYPE k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
252 DATA_TYPE k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100253
254 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
255 out01 += k1 - d02 - d12 - d22 - d32 - d42;
256 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
257 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
258
259 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100260 DATA_TYPE out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
261 DATA_TYPE out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
262 DATA_TYPE out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
263 DATA_TYPE out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100264
265 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
266 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
267
268 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
269 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
270 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
271 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
272
273 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100274 DATA_TYPE out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
275 DATA_TYPE out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
276 DATA_TYPE out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
277 DATA_TYPE out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100278
279 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
280 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
281
282 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
283 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
284 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
285 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
286
287 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100288 DATA_TYPE out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
289 DATA_TYPE out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
290 DATA_TYPE out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
291 DATA_TYPE out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100292
293 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
294 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
295
296 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
297 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
298 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
299 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
300#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
301
302 int y_in = get_global_id(1);
303 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
304 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
305 int z_out = get_global_id(0);
306
307#if defined(HAS_BIAS)
308 // Add bias
309 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
310
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100311 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100312
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100313 out00 += (DATA_TYPE)b;
314 out01 += (DATA_TYPE)b;
315 out02 += (DATA_TYPE)b;
316 out03 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100317#endif // defined(HAS_BIAS)
318
319 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100320 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100321
322 // Store the output tile
323#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100324 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
325 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
326 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
327 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100328#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100329 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100330#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
331
332#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
333#if defined(HAS_BIAS)
334 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100335 out10 += (DATA_TYPE)b;
336 out11 += (DATA_TYPE)b;
337 out12 += (DATA_TYPE)b;
338 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100339
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100340 out20 += (DATA_TYPE)b;
341 out21 += (DATA_TYPE)b;
342 out22 += (DATA_TYPE)b;
343 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100344
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100345 out30 += (DATA_TYPE)b;
346 out31 += (DATA_TYPE)b;
347 out32 += (DATA_TYPE)b;
348 out33 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100349#endif // defined(HAS_BIAS)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100350 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out10, out11, out12, out13), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
351 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out20, out21, out22, out23), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
352 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out30, out31, out32, out33), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100353#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
354}
355
Giorgio Arena149fdf32018-07-04 17:03:33 +0100356/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100357 *
358 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100359 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
360 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
361 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
362 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100363 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100364 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100365 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100366 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
367 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
368 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
369 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
370 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
371 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
372 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
373 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
374 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
375 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
376 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
377 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
378 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
379 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
380 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
381 * @param[in] dst_size Size of the destination tensor, minus the last padding
382 */
383__kernel void winograd_output_transform_4x4_3x3_nhwc(
384 TENSOR3D_DECLARATION(src),
385 TENSOR3D_DECLARATION(dst),
386#if defined(HAS_BIAS)
387 VECTOR_DECLARATION(bias),
388#endif // defined(HAS_BIAS)
389 int dst_size)
390{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100391 // Each thread stores a 4x4/4x1 or 1x4 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100392 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
393
394 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
395
Giorgio Arena149fdf32018-07-04 17:03:33 +0100396 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100397 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
398 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
399 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
400 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
401 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
402 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100403
Giorgio Arena149fdf32018-07-04 17:03:33 +0100404#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
405 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100406 DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04;
407 DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
408 DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
409 DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100410#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
411
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100412 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
413 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
414 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
415 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
416 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
417 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100418
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100419 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
420 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
421 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
422 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
423 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
424 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100425
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100426 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
427 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
428 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
429 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
430 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
431 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100432
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100433 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
434 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
435 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
436 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
437 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
438 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100439
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100440 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
441 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
442 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
443 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
444 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
445 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100446
447 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100448 DATA_TYPE out00 = d01 + d21 + d41 + d11 + d31;
449 DATA_TYPE out01 = d01 + d21 + d41 + d11 + d31;
450 DATA_TYPE out02 = d01 + d21 + d41 + d11 + d31;
451 DATA_TYPE out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100452
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100453 DATA_TYPE k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
454 DATA_TYPE k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100455
456 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
457 out01 += k1 - d02 - d12 - d22 - d32 - d42;
458 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
459 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
460
461 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100462 DATA_TYPE out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
463 DATA_TYPE out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
464 DATA_TYPE out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
465 DATA_TYPE out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100466
467 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
468 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
469
470 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
471 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
472 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
473 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
474
475 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100476 DATA_TYPE out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
477 DATA_TYPE out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
478 DATA_TYPE out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
479 DATA_TYPE out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100480
481 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
482 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
483
484 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
485 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
486 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
487 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
488
489 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100490 DATA_TYPE out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
491 DATA_TYPE out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
492 DATA_TYPE out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
493 DATA_TYPE out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100494
495 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
496 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
497
498 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
499 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
500 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
501 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100502#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100503
504 int y_in = get_global_id(1);
505 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100506 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
507 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100508
509#if defined(HAS_BIAS)
510 // Add bias
511 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
512
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100513 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100514
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100515 out00 += (DATA_TYPE)b;
516 out01 += (DATA_TYPE)b;
517 out02 += (DATA_TYPE)b;
518 out03 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100519#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100520 out10 += (DATA_TYPE)b;
521 out11 += (DATA_TYPE)b;
522 out12 += (DATA_TYPE)b;
523 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100524
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100525 out20 += (DATA_TYPE)b;
526 out21 += (DATA_TYPE)b;
527 out22 += (DATA_TYPE)b;
528 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100529
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100530 out30 += (DATA_TYPE)b;
531 out31 += (DATA_TYPE)b;
532 out32 += (DATA_TYPE)b;
533 out33 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100534#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100535
536#endif // defined(HAS_BIAS)
537
Giorgio Arena149fdf32018-07-04 17:03:33 +0100538#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100539 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100540 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Giorgio Arena149fdf32018-07-04 17:03:33 +0100541
542 // Store the 1x4 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100543 *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out00;
544 *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out01;
545 *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out02;
546 *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out03;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100547#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
548 // Store the 4x1 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100549 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenad02eb452018-07-18 11:45:30 +0100550 int mult_y = min(dst_size - offset, 1);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100551
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100552 *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out00;
553 *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out01;
554 *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out02;
555 *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100556#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100557 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100558 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100559 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100560 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100561
562 // Store the 4x4 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100563 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;
564 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;
565 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;
566 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;
567 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;
568 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;
569 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;
570 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;
571 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;
572 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;
573 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;
574 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;
575 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;
576 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;
577 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;
578 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100579
580#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100581}
582
583#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
584 ({ \
585 comm_fact.s0 = d1 + d2; \
586 comm_fact.s1 = d3 + d4; \
587 comm_fact.s2 = d5 + d6; \
588 \
589 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
590 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
591 \
592 comm_fact.s0 = d1 - d2; \
593 comm_fact.s1 = d3 - d4; \
594 comm_fact.s2 = d5 - d6; \
595 \
596 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
597 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
598 })
599
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100600/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100601 *
602 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100603 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
604 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
605 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
606 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100607 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100608 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100609 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100610 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
611 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
612 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
613 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
614 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
615 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
616 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
617 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
618 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
619 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
620 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
621 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
622 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
623 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
624 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
625 */
626__kernel void winograd_output_transform_4x4_5x5_nchw(
627 TENSOR3D_DECLARATION(src),
628 TENSOR3D_DECLARATION(dst)
629#if defined(HAS_BIAS)
630 ,
631 VECTOR_DECLARATION(bias)
632#endif // defined(HAS_BIAS)
633)
634{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100635 // Each thread stores a 4x4/4x1 or 1x4 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100636 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
637
638 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
639
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100640 // Compute output address
641 int y_in = get_global_id(1);
642 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
643 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
644 int z_out = get_global_id(0);
645
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100646 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100647
648 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100649 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
650 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
651 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
652 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
653 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
654 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
655 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
656 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100657
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100658#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
659 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100660 DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
661 DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
662 DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
663 DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100664
665#if defined(HAS_BIAS)
666 // Add bias
667 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
668
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100669 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100670
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100671 out00 += (DATA_TYPE)b;
672 out01 += (DATA_TYPE)b;
673 out02 += (DATA_TYPE)b;
674 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100675#endif // defined(HAS_BIAS)
676
677 // Store the output tile
678#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100679 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
680 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
681 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
682 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100683#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100684 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100685#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
686
687#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100688 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
689 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
690 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
691 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
692 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
693 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
694 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
695 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100696
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100697 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
698 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
699 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
700 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
701 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
702 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
703 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
704 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100705
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100706 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
707 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
708 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
709 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
710 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
711 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
712 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
713 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100714
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100715 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
716 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
717 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
718 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
719 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
720 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
721 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
722 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100723
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100724 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
725 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
726 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
727 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
728 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
729 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
730 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
731 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100732
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100733 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
734 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
735 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
736 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
737 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
738 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
739 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
740 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100741
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100742 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
743 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
744 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
745 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
746 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
747 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
748 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
749 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100750
751 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100752 VEC_DATA_TYPE(DATA_TYPE, 4)
753 comm_fact0, comm_fact1, comm_fact2;
754 VEC_DATA_TYPE(DATA_TYPE, 4)
755 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100756
757 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
758 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
759 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
760 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
761 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
762 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
763 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
764 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
765
766 // Compute the 4x4 output tile
767 comm_fact0 = tmp_col1 + tmp_col2;
768 comm_fact1 = tmp_col3 + tmp_col4;
769 comm_fact2 = tmp_col5 + tmp_col6;
770
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100771 VEC_DATA_TYPE(DATA_TYPE, 4)
772 out_col0 = comm_fact0 + comm_fact1 + (DATA_TYPE)8.f * comm_fact2 + tmp_col0;
773 VEC_DATA_TYPE(DATA_TYPE, 4)
774 out_col2 = comm_fact0 + (DATA_TYPE)4.f * comm_fact1 + (DATA_TYPE)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100775
776 comm_fact0 = tmp_col1 - tmp_col2;
777 comm_fact1 = tmp_col3 - tmp_col4;
778 comm_fact2 = tmp_col5 - tmp_col6;
779
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100780 VEC_DATA_TYPE(DATA_TYPE, 4)
781 out_col1 = comm_fact0 + (DATA_TYPE)2.f * comm_fact1 + (DATA_TYPE)4.f * comm_fact2;
782 VEC_DATA_TYPE(DATA_TYPE, 4)
783 out_col3 = comm_fact0 + (DATA_TYPE)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100784
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100785#if defined(HAS_BIAS)
786 // Add bias
787 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
788
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100789 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100790
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100791 out_col0 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
792 out_col1 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
793 out_col2 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
794 out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100795#endif // defined(HAS_BIAS)
796
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100797 // Store the output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100798 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
799 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
800 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
801 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100802#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100803}
804
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100805/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100806 *
807 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100808 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
809 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
810 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
811 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100812 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100813 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100814 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100815 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
816 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
817 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
818 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
819 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
820 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
821 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
822 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
823 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
824 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
825 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
826 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
827 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
828 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
829 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
830 */
831__kernel void winograd_output_transform_4x4_5x5_nhwc(
832 TENSOR3D_DECLARATION(src),
833 TENSOR3D_DECLARATION(dst),
834#if defined(HAS_BIAS)
835 VECTOR_DECLARATION(bias),
836#endif // defined(HAS_BIAS)
837 int dst_size)
838{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100839 // Each thread stores a 4x4/4x1 or 1x4 tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100840 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
841
842 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
843
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100844 int y_in = get_global_id(1);
845 int x_out = get_global_id(0);
846 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
847 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
848
849 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100850 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
851 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
852 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
853 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
854 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
855 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
856 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
857 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100858
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100859#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
860 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100861 DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
862 DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
863 DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
864 DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100865
866#if defined(HAS_BIAS)
867 // Add bias
868 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
869
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100870 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100871
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100872 out00 += (DATA_TYPE)b;
873 out01 += (DATA_TYPE)b;
874 out02 += (DATA_TYPE)b;
875 out03 += (DATA_TYPE)b;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100876#endif // defined(HAS_BIAS)
877
878 // Store the output tile
879#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
880 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100881 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100882 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100883
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100884 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out00;
885 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out01;
886 *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out02;
887 *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out03;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100888#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
889 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100890 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100891
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100892 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out00;
893 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out01;
894 *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out02;
895 *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out03;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100896#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
897
898#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
899
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100900 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
901 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
902 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
903 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
904 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
905 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
906 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
907 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100908
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100909 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
910 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
911 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
912 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
913 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
914 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
915 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
916 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100917
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100918 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
919 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
920 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
921 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
922 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
923 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
924 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
925 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100926
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100927 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
928 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
929 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
930 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
931 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
932 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
933 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
934 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100935
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100936 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
937 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
938 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
939 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
940 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
941 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
942 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
943 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100944
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100945 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
946 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
947 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
948 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
949 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
950 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
951 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
952 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100953
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100954 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
955 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
956 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
957 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
958 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
959 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
960 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
961 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100962
963 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100964 VEC_DATA_TYPE(DATA_TYPE, 4)
965 comm_fact0, comm_fact1, comm_fact2;
966 VEC_DATA_TYPE(DATA_TYPE, 4)
967 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100968
969 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
970 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
971 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
972 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
973 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
974 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
975 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
976 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
977
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100978 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100979 comm_fact0 = tmp_col1 + tmp_col2;
980 comm_fact1 = tmp_col3 + tmp_col4;
981 comm_fact2 = tmp_col5 + tmp_col6;
982
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100983 VEC_DATA_TYPE(DATA_TYPE, 4)
984 out_col0 = comm_fact0 + comm_fact1 + (DATA_TYPE)8.f * comm_fact2 + tmp_col0;
985 VEC_DATA_TYPE(DATA_TYPE, 4)
986 out_col2 = comm_fact0 + (DATA_TYPE)4.f * comm_fact1 + (DATA_TYPE)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100987
988 comm_fact0 = tmp_col1 - tmp_col2;
989 comm_fact1 = tmp_col3 - tmp_col4;
990 comm_fact2 = tmp_col5 - tmp_col6;
991
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100992 VEC_DATA_TYPE(DATA_TYPE, 4)
993 out_col1 = comm_fact0 + (DATA_TYPE)2.f * comm_fact1 + (DATA_TYPE)4.f * comm_fact2;
994 VEC_DATA_TYPE(DATA_TYPE, 4)
995 out_col3 = comm_fact0 + (DATA_TYPE)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100996
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100997#if defined(HAS_BIAS)
998 // Add bias
999 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1000
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001001 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001002
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001003 out_col0 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
1004 out_col1 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
1005 out_col2 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
1006 out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001007#endif // defined(HAS_BIAS)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001008 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001009 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Georgios Pinitasb0b37172018-07-20 17:30:56 +01001010 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
1011 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001012
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001013 // Store the output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001014 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0.s0;
1015 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1.s0;
1016 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2.s0;
1017 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3.s0;
1018 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0.s1;
1019 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1.s1;
1020 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2.s1;
1021 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3.s1;
1022 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0.s2;
1023 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1.s2;
1024 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2.s2;
1025 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3.s2;
1026 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0.s3;
1027 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1.s3;
1028 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2.s3;
1029 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001030#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001031}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001032
1033#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1034/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1035 *
1036 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1037 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1038 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1039 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001040 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001041 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001042 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001043 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1044 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1045 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1046 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1047 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1048 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1049 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1050 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1051 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1052 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1053 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1054 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1055 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1056 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1057 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1058 */
1059__kernel void winograd_output_transform_2x1_3x1_nchw(
1060 TENSOR3D_DECLARATION(src),
1061 TENSOR3D_DECLARATION(dst)
1062#if defined(HAS_BIAS)
1063 ,
1064 VECTOR_DECLARATION(bias)
1065#endif // defined(HAS_BIAS)
1066)
1067{
1068 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1069 src_stride_x,
1070 src_step_x,
1071 src_stride_y,
1072 src_step_y,
1073 src_stride_z,
1074 src_step_z,
1075 src_offset_first_element_in_bytes,
1076 dst_ptr,
1077 dst_stride_x,
1078 dst_step_x,
1079 dst_stride_y,
1080 dst_step_y,
1081 dst_stride_z,
1082 dst_step_z,
1083 dst_offset_first_element_in_bytes
1084#if defined(HAS_BIAS)
1085 ,
1086 bias_ptr,
1087 bias_stride_x,
1088 bias_step_x,
1089 bias_offset_first_element_in_bytes
1090#endif // defined(HAS_BIAS)
1091 );
1092}
1093
1094/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1095 *
1096 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1097 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1098 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1099 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001100 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001101 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001102 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001103 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1104 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1105 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1106 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1107 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1108 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1109 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1110 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1111 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1112 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1113 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1114 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1115 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1116 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1117 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1118 */
1119__kernel void winograd_output_transform_4x1_3x1_nchw(
1120 TENSOR3D_DECLARATION(src),
1121 TENSOR3D_DECLARATION(dst)
1122#if defined(HAS_BIAS)
1123 ,
1124 VECTOR_DECLARATION(bias)
1125#endif // defined(HAS_BIAS)
1126)
1127{
1128 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1129 src_stride_x,
1130 src_step_x,
1131 src_stride_y,
1132 src_step_y,
1133 src_stride_z,
1134 src_step_z,
1135 src_offset_first_element_in_bytes,
1136 dst_ptr,
1137 dst_stride_x,
1138 dst_step_x,
1139 dst_stride_y,
1140 dst_step_y,
1141 dst_stride_z,
1142 dst_step_z,
1143 dst_offset_first_element_in_bytes
1144#if defined(HAS_BIAS)
1145 ,
1146 bias_ptr,
1147 bias_stride_x,
1148 bias_step_x,
1149 bias_offset_first_element_in_bytes
1150#endif // defined(HAS_BIAS)
1151 );
1152}
1153
1154/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1155 *
1156 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1157 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1158 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1159 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001160 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001161 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001162 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001163 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1164 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1165 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1166 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1167 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1168 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1169 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1170 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1171 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1172 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1173 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1174 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1175 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1176 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1177 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1178 */
1179__kernel void winograd_output_transform_4x1_5x1_nchw(
1180 TENSOR3D_DECLARATION(src),
1181 TENSOR3D_DECLARATION(dst)
1182#if defined(HAS_BIAS)
1183 ,
1184 VECTOR_DECLARATION(bias)
1185#endif // defined(HAS_BIAS)
1186)
1187{
1188 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1189 src_stride_x,
1190 src_step_x,
1191 src_stride_y,
1192 src_step_y,
1193 src_stride_z,
1194 src_step_z,
1195 src_offset_first_element_in_bytes,
1196 dst_ptr,
1197 dst_stride_x,
1198 dst_step_x,
1199 dst_stride_y,
1200 dst_step_y,
1201 dst_stride_z,
1202 dst_step_z,
1203 dst_offset_first_element_in_bytes
1204#if defined(HAS_BIAS)
1205 ,
1206 bias_ptr,
1207 bias_stride_x,
1208 bias_step_x,
1209 bias_offset_first_element_in_bytes
1210#endif // defined(HAS_BIAS)
1211 );
1212}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001213
1214/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1215 *
1216 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1217 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1218 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1219 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001220 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001221 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001222 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001223 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1224 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1225 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1226 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1227 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1228 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1229 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1230 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1231 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1232 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1233 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1234 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1235 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1236 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1237 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1238 */
1239__kernel void winograd_output_transform_4x1_3x1_nhwc(
1240 TENSOR3D_DECLARATION(src),
1241 TENSOR3D_DECLARATION(dst),
1242#if defined(HAS_BIAS)
1243 VECTOR_DECLARATION(bias),
1244#endif // defined(HAS_BIAS)
1245 int dst_size)
1246{
1247 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1248 src_stride_x,
1249 src_step_x,
1250 src_stride_y,
1251 src_step_y,
1252 src_stride_z,
1253 src_step_z,
1254 src_offset_first_element_in_bytes,
1255 dst_ptr,
1256 dst_stride_x,
1257 dst_step_x,
1258 dst_stride_y,
1259 dst_step_y,
1260 dst_stride_z,
1261 dst_step_z,
1262 dst_offset_first_element_in_bytes,
1263#if defined(HAS_BIAS)
1264 bias_ptr,
1265 bias_stride_x,
1266 bias_step_x,
1267 bias_offset_first_element_in_bytes,
1268#endif // defined(HAS_BIAS)
1269 dst_size);
1270}
1271
1272/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1273 *
1274 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1275 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1276 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1277 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001278 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001279 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001280 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001281 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1282 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1283 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1284 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1285 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1286 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1287 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1288 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1289 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1290 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1291 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1292 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1293 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1294 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1295 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1296 */
1297__kernel void winograd_output_transform_4x1_5x1_nhwc(
1298 TENSOR3D_DECLARATION(src),
1299 TENSOR3D_DECLARATION(dst),
1300#if defined(HAS_BIAS)
1301 VECTOR_DECLARATION(bias),
1302#endif // defined(HAS_BIAS)
1303 int dst_size)
1304{
1305 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1306 src_stride_x,
1307 src_step_x,
1308 src_stride_y,
1309 src_step_y,
1310 src_stride_z,
1311 src_step_z,
1312 src_offset_first_element_in_bytes,
1313 dst_ptr,
1314 dst_stride_x,
1315 dst_step_x,
1316 dst_stride_y,
1317 dst_step_y,
1318 dst_stride_z,
1319 dst_step_z,
1320 dst_offset_first_element_in_bytes,
1321#if defined(HAS_BIAS)
1322 bias_ptr,
1323 bias_stride_x,
1324 bias_step_x,
1325 bias_offset_first_element_in_bytes,
1326#endif // defined(HAS_BIAS)
1327 dst_size);
1328}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001329#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1330
1331#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1332/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1333 *
1334 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1335 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1336 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1337 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001338 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001339 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001340 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001341 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1342 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1343 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1344 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1345 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1346 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1347 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1348 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1349 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1350 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1351 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1352 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1353 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1354 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1355 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1356 */
1357__kernel void winograd_output_transform_1x2_1x3_nchw(
1358 TENSOR3D_DECLARATION(src),
1359 TENSOR3D_DECLARATION(dst)
1360#if defined(HAS_BIAS)
1361 ,
1362 VECTOR_DECLARATION(bias)
1363#endif // defined(HAS_BIAS)
1364)
1365{
1366 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1367 src_stride_x,
1368 src_step_x,
1369 src_stride_y,
1370 src_step_y,
1371 src_stride_z,
1372 src_step_z,
1373 src_offset_first_element_in_bytes,
1374 dst_ptr,
1375 dst_stride_x,
1376 dst_step_x,
1377 dst_stride_y,
1378 dst_step_y,
1379 dst_stride_z,
1380 dst_step_z,
1381 dst_offset_first_element_in_bytes
1382#if defined(HAS_BIAS)
1383 ,
1384 bias_ptr,
1385 bias_stride_x,
1386 bias_step_x,
1387 bias_offset_first_element_in_bytes
1388#endif // defined(HAS_BIAS)
1389 );
1390}
1391
1392/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1393 *
1394 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1395 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1396 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1397 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001398 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001399 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001400 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001401 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1402 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1403 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1404 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1405 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1406 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1407 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1408 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1409 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1410 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1411 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1412 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1413 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1414 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1415 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1416 */
1417__kernel void winograd_output_transform_1x4_1x3_nchw(
1418 TENSOR3D_DECLARATION(src),
1419 TENSOR3D_DECLARATION(dst)
1420#if defined(HAS_BIAS)
1421 ,
1422 VECTOR_DECLARATION(bias)
1423#endif // defined(HAS_BIAS)
1424)
1425{
1426 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1427 src_stride_x,
1428 src_step_x,
1429 src_stride_y,
1430 src_step_y,
1431 src_stride_z,
1432 src_step_z,
1433 src_offset_first_element_in_bytes,
1434 dst_ptr,
1435 dst_stride_x,
1436 dst_step_x,
1437 dst_stride_y,
1438 dst_step_y,
1439 dst_stride_z,
1440 dst_step_z,
1441 dst_offset_first_element_in_bytes
1442#if defined(HAS_BIAS)
1443 ,
1444 bias_ptr,
1445 bias_stride_x,
1446 bias_step_x,
1447 bias_offset_first_element_in_bytes
1448#endif // defined(HAS_BIAS)
1449 );
1450}
1451
1452/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
1453 *
1454 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1455 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1456 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1457 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001458 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001459 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001460 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001461 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1462 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1463 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1464 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1465 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1466 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1467 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1468 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1469 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1470 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1471 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1472 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1473 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1474 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1475 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1476 */
1477__kernel void winograd_output_transform_1x4_1x5_nchw(
1478 TENSOR3D_DECLARATION(src),
1479 TENSOR3D_DECLARATION(dst)
1480#if defined(HAS_BIAS)
1481 ,
1482 VECTOR_DECLARATION(bias)
1483#endif // defined(HAS_BIAS)
1484)
1485{
1486 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1487 src_stride_x,
1488 src_step_x,
1489 src_stride_y,
1490 src_step_y,
1491 src_stride_z,
1492 src_step_z,
1493 src_offset_first_element_in_bytes,
1494 dst_ptr,
1495 dst_stride_x,
1496 dst_step_x,
1497 dst_stride_y,
1498 dst_step_y,
1499 dst_stride_z,
1500 dst_step_z,
1501 dst_offset_first_element_in_bytes
1502#if defined(HAS_BIAS)
1503 ,
1504 bias_ptr,
1505 bias_stride_x,
1506 bias_step_x,
1507 bias_offset_first_element_in_bytes
1508#endif // defined(HAS_BIAS)
1509 );
1510}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001511
1512/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
1513 *
1514 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1515 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1516 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1517 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001518 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001519 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001520 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001521 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1522 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1523 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1524 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1525 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1526 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1527 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1528 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1529 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1530 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1531 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1532 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1533 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1534 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1535 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1536 */
1537__kernel void winograd_output_transform_1x4_1x3_nhwc(
1538 TENSOR3D_DECLARATION(src),
1539 TENSOR3D_DECLARATION(dst),
1540#if defined(HAS_BIAS)
1541 VECTOR_DECLARATION(bias),
1542#endif // defined(HAS_BIAS)
1543 int dst_size)
1544{
1545 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1546 src_stride_x,
1547 src_step_x,
1548 src_stride_y,
1549 src_step_y,
1550 src_stride_z,
1551 src_step_z,
1552 src_offset_first_element_in_bytes,
1553 dst_ptr,
1554 dst_stride_x,
1555 dst_step_x,
1556 dst_stride_y,
1557 dst_step_y,
1558 dst_stride_z,
1559 dst_step_z,
1560 dst_offset_first_element_in_bytes,
1561#if defined(HAS_BIAS)
1562 bias_ptr,
1563 bias_stride_x,
1564 bias_step_x,
1565 bias_offset_first_element_in_bytes,
1566#endif // defined(HAS_BIAS)
1567 dst_size);
1568}
1569
1570/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
1571 *
1572 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1573 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1574 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1575 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001576 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001577 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001578 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001579 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1580 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1581 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1582 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1583 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1584 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1585 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1586 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1587 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1588 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1589 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1590 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1591 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1592 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1593 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1594 */
1595__kernel void winograd_output_transform_1x4_1x5_nhwc(
1596 TENSOR3D_DECLARATION(src),
1597 TENSOR3D_DECLARATION(dst),
1598#if defined(HAS_BIAS)
1599 VECTOR_DECLARATION(bias),
1600#endif // defined(HAS_BIAS)
1601 int dst_size)
1602{
1603 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1604 src_stride_x,
1605 src_step_x,
1606 src_stride_y,
1607 src_step_y,
1608 src_stride_z,
1609 src_step_z,
1610 src_offset_first_element_in_bytes,
1611 dst_ptr,
1612 dst_stride_x,
1613 dst_step_x,
1614 dst_stride_y,
1615 dst_step_y,
1616 dst_stride_z,
1617 dst_step_z,
1618 dst_offset_first_element_in_bytes,
1619#if defined(HAS_BIAS)
1620 bias_ptr,
1621 bias_stride_x,
1622 bias_step_x,
1623 bias_offset_first_element_in_bytes,
1624#endif // defined(HAS_BIAS)
1625 dst_size);
1626}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001627#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001628#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)