blob: cffc12d6edf9a210345ce79e13550c7acab80f03 [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
giuros013bfacb22019-04-01 12:07:02 +01002 * Copyright (c) 2018-2019 ARM Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Manuel Bottini0d0028c2018-10-02 16:41:52 +010026#if defined(FUSED_ACTIVATION)
27#include "activation_layer.cl"
28#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
29#else /* defined(FUSED_ACTIVATION) */
30#define ACTIVATION_FUNC(x) (x)
31#endif /* defined(FUSED_ACTIVATION) */
32
Georgios Pinitasffb57a02018-10-29 18:01:52 +000033#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010034#if defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010035/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
36 *
37 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
38 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
39 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
40 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
41 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010042 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Manuel Bottini0d0028c2018-10-02 16:41:52 +010043 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
44 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
45 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
46 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=int
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010047 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010048 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010049 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
50 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
51 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
52 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
53 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
54 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010055 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
56 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010057 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
58 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
59 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
60 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
61 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
62 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010063 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
64 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
65 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
66 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010067 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
68 */
69__kernel void winograd_output_transform_2x2_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +010070 TENSOR4D_DECLARATION(src),
71 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010072#if defined(HAS_BIAS)
73 ,
74 VECTOR_DECLARATION(bias)
75#endif // defined(HAS_BIAS)
76)
77{
78 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
Georgios Pinitasffb57a02018-10-29 18:01:52 +000079#if defined(SRC_DEPTH)
80 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +010081 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +000082#else /* defined(SRC_DEPTH) */
83 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
84 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
85#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010086
87 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010088 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
89 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
90 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
91 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010092
93#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
94 // Compute the 2x1 or 1x2 output tile
95 // out00 = d00 + d01 + d02
96 // out01 = d01 - d02 - d03
97
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +000098 float out00 = d00 + d01 + d02;
99 float out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100100#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100101
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100102 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
103 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
104 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
105 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100106
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100107 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
108 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
109 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
110 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100111
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100112 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
113 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
114 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
115 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100116
117 // Compute the 2x2 output tile
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000118 float k0 = d01 + d11 + d21;
119 float k1 = d02 + d12 + d22;
120 float k2 = d11 - d21 - d31;
121 float k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100122
123 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
124 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
125 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
126 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
127
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000128 float out00 = d10;
129 float out01 = -d13;
130 float out10 = d10;
131 float out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100132
133 out00 += d00 + d20 + k0 + k1;
134 out01 += k0 - k1 - (d03 + d23);
135 out10 += -d20 - d30 + k2 + k3;
136 out11 += k2 - k3 + d23 + d33;
137#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
138
139 int y_in = get_global_id(1);
140 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
141 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
142 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000143#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100144 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000145#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100146
147#if defined(HAS_BIAS)
148 // Add bias
149 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
150
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000151 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100152
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000153 out00 += (float)b;
154 out01 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100155#endif // defined(HAS_BIAS)
156
157 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000158#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100159 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000160#else /* defined(SRC_DEPTH) */
161 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
162#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100163
164 // Store the output tile
165#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100166 const const VEC_DATA_TYPE(DATA_TYPE, 2)
167 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
168 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
169 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100170#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100171 vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100172#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
173
174#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
175#if defined(HAS_BIAS)
176 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100177 out10 += (DATA_TYPE)b;
178 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100179#endif // defined(HAS_BIAS)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100180 vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100181#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
182}
giuros013bfacb22019-04-01 12:07:02 +0100183
184#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
185 ({ \
186 col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \
187 col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \
188 })
189
190/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
191 *
192 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
193 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
194 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
195 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
196 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
197 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
198 *
199 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
200 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
201 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
202 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
203 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
204 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
205 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
206 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
207 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
208 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
209 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
210 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
211 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
212 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
213 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
214 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
215 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
216 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
217 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
218 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
219 */
220__kernel void winograd_output_transform_2x2_7x7_nhwc(
221 TENSOR4D_DECLARATION(src),
222 TENSOR4D_DECLARATION(dst),
223#if defined(HAS_BIAS)
224 VECTOR_DECLARATION(bias),
225#endif // defined(HAS_BIAS)
226 int dst_size)
227{
228 // Each thread stores a 4x4/4x1 or 1x4 tile
229#if defined(SRC_DEPTH)
230 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
231 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
232#else /* defined(SRC_DEPTH) */
233 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
234 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
235#endif /* defined(SRC_DEPTH) */
236
237 int y_in = get_global_id(1);
238 int x_out = get_global_id(0);
239 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
240 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
241#if defined(SRC_DEPTH)
242 int batch = get_global_id(2) / SRC_DEPTH;
243#endif /* defined(SRC_DEPTH) */
244
245#if defined(SRC_DEPTH)
246 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
247#else /* defined(SRC_DEPTH) */
248
249 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
250#endif /* defined(SRC_DEPTH) */
251
252 // Load the values across the channels to compose the input tile
253 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
254 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
255 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
256 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
257 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
258 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
259 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
260 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
261
262#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
263 // Compute out00, out01, out02 and out03
264 float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
265 float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
266
267#if defined(HAS_BIAS)
268 // Add bias
269 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
270
271 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
272
273 out00 += (float)b;
274 out01 += (float)b;
275#endif // defined(HAS_BIAS)
276
277 // Store the output tile
278#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
279 // Get output address
280#if defined(SRC_DEPTH)
281 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
282#else /* defined(SRC_DEPTH) */
283 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
284#endif /* defined(SRC_DEPTH) */
285 offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
286
287 VEC_DATA_TYPE(DATA_TYPE, 2)
288 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
289 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
290 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
291#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
292 // Get output address
293 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
294 VEC_DATA_TYPE(DATA_TYPE, 2)
295 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
296 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
297 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
298#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
299
300#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
301
302 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
303 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
304 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
305 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
306 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
307 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
308 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
309 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
310
311 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
312 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
313 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
314 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
315 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
316 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
317 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
318 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
319
320 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
321 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
322 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
323 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
324 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
325 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
326 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
327 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
328
329 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
330 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
331 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
332 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
333 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
334 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
335 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
336 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
337
338 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
339 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
340 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
341 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
342 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
343 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
344 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
345 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
346
347 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
348 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
349 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
350 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
351 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
352 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
353 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
354 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
355
356 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
357 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
358 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
359 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
360 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
361 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
362 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
363 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
364
365 // Compute the 8x2 intermediate tensor
366 VEC_DATA_TYPE(float, 2)
367 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
368
369 COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
370 COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
371 COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
372 COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
373 COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
374 COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
375 COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
376 COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
377
378 // Compute the 2x2 output tile
379 VEC_DATA_TYPE(float, 2)
380 out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
381 VEC_DATA_TYPE(float, 2)
382 out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
383
384#if defined(HAS_BIAS)
385 // Add bias
386 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
387
388 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
389
390 out_col0 += (VEC_DATA_TYPE(float, 2))b;
391 out_col1 += (VEC_DATA_TYPE(float, 2))b;
392
393#endif // defined(HAS_BIAS)
394 // Get output address
395#if defined(SRC_DEPTH)
396 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
397#else /* defined(SRC_DEPTH) */
398 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
399#endif /* defined(SRC_DEPTH) */
400 offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
401 int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
402
403 // Store the output tile
404 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
405 out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
406 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
407 out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
408
409 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
410 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
411
412 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
413 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
414
415#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
416}
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100417#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100418
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100419#if defined(VEC_SIZE) && VEC_SIZE == 4
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100420/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
421 *
422 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
423 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
424 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
425 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
426 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100427 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100428 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100429 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100430 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
431 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
432 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
433 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
434 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
435 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100436 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
437 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100438 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
439 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
440 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
441 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
442 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
443 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100444 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
445 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
446 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
447 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100448 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
449 */
450__kernel void winograd_output_transform_4x4_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100451 TENSOR4D_DECLARATION(src),
452 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100453#if defined(HAS_BIAS)
454 ,
455 VECTOR_DECLARATION(bias)
456#endif // defined(HAS_BIAS)
457)
458{
459 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000460#if defined(SRC_DEPTH)
461 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100462 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000463#else /* defined(SRC_DEPTH) */
giuros013bfacb22019-04-01 12:07:02 +0100464 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
465 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000466#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100467
468 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100469 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
470 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
471 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
472 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
473 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
474 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100475
476#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
477 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000478 float out00 = d00 + d01 + d02 + d03 + d04;
479 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
480 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
481 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100482#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100483
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100484 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
485 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
486 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
487 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
488 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
489 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100490
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100491 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
492 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
493 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
494 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
495 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
496 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100497
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100498 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
499 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
500 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
501 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
502 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
503 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100504
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100505 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
506 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
507 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
508 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
509 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
510 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100511
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100512 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
513 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
514 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
515 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
516 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
517 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100518
519 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000520 float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
521 float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
522 float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
523 float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100524
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000525 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
526 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100527
528 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
529 out01 += k1 - d02 - d12 - d22 - d32 - d42;
530 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
531 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
532
533 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000534 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
535 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
536 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
537 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100538
539 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
540 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
541
542 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
543 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
544 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
545 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
546
547 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000548 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
549 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
550 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
551 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100552
553 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
554 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
555
556 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
557 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
558 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
559 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
560
561 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000562 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
563 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
564 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
565 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100566
567 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
568 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
569
570 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
571 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
572 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
573 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
574#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
575
576 int y_in = get_global_id(1);
577 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
578 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
579 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000580#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100581 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000582#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100583
584#if defined(HAS_BIAS)
585 // Add bias
586 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
587
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000588 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100589
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000590 out00 += (float)b;
591 out01 += (float)b;
592 out02 += (float)b;
593 out03 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100594#endif // defined(HAS_BIAS)
595
596 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000597#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100598 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000599#else /* defined(SRC_DEPTH) */
600 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
601#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100602
603 // Store the output tile
604#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100605 VEC_DATA_TYPE(DATA_TYPE, 4)
606 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
607 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
608 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
609 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
610 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100611#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100612 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100613#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
614
615#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
616#if defined(HAS_BIAS)
617 // Add bias
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000618 out10 += (float)b;
619 out11 += (float)b;
620 out12 += (float)b;
621 out13 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100622
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000623 out20 += (float)b;
624 out21 += (float)b;
625 out22 += (float)b;
626 out23 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100627
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000628 out30 += (float)b;
629 out31 += (float)b;
630 out32 += (float)b;
631 out33 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100632#endif // defined(HAS_BIAS)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100633 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
634 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
635 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100636#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
637}
638
Giorgio Arena149fdf32018-07-04 17:03:33 +0100639/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100640 *
641 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100642 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
643 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
644 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
645 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100646 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100647 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100648 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100649 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
650 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
651 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
652 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
653 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
654 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100655 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
656 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100657 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
658 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
659 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
660 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
661 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
662 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100663 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
664 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
665 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
666 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100667 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
668 * @param[in] dst_size Size of the destination tensor, minus the last padding
669 */
670__kernel void winograd_output_transform_4x4_3x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100671 TENSOR4D_DECLARATION(src),
672 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100673#if defined(HAS_BIAS)
674 VECTOR_DECLARATION(bias),
675#endif // defined(HAS_BIAS)
676 int dst_size)
677{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100678 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000679#if defined(SRC_DEPTH)
680 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100681 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000682#else /* defined(SRC_DEPTH) */
683 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
684 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
685#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100686
Giorgio Arena149fdf32018-07-04 17:03:33 +0100687 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100688 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
689 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
690 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
691 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
692 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
693 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100694
Giorgio Arena149fdf32018-07-04 17:03:33 +0100695#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
696 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000697 float out00 = d00 + d01 + d02 + d03 + d04;
698 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
699 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
700 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100701#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
702
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100703 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
704 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
705 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
706 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
707 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
708 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100709
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100710 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
711 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
712 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
713 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
714 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
715 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100716
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100717 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
718 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
719 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
720 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
721 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
722 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100723
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100724 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
725 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
726 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
727 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
728 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
729 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100730
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100731 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
732 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
733 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
734 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
735 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
736 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100737
738 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000739 float out00 = d01 + d21 + d41 + d11 + d31;
740 float out01 = d01 + d21 + d41 + d11 + d31;
741 float out02 = d01 + d21 + d41 + d11 + d31;
742 float out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100743
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000744 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
745 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100746
747 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
748 out01 += k1 - d02 - d12 - d22 - d32 - d42;
749 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
750 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
751
752 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000753 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
754 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
755 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
756 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100757
758 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
759 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
760
761 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
762 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
763 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
764 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
765
766 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000767 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
768 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
769 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
770 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100771
772 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
773 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
774
775 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
776 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
777 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
778 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
779
780 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000781 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
782 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
783 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
784 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100785
786 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
787 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
788
789 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
790 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
791 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
792 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100793#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100794
795 int y_in = get_global_id(1);
796 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100797 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
798 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000799#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100800 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000801#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100802
803#if defined(HAS_BIAS)
804 // Add bias
805 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
806
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100807 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100808
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100809 out00 += (DATA_TYPE)b;
810 out01 += (DATA_TYPE)b;
811 out02 += (DATA_TYPE)b;
812 out03 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100813#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100814 out10 += (DATA_TYPE)b;
815 out11 += (DATA_TYPE)b;
816 out12 += (DATA_TYPE)b;
817 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100818
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100819 out20 += (DATA_TYPE)b;
820 out21 += (DATA_TYPE)b;
821 out22 += (DATA_TYPE)b;
822 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100823
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100824 out30 += (DATA_TYPE)b;
825 out31 += (DATA_TYPE)b;
826 out32 += (DATA_TYPE)b;
827 out33 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100828#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100829
830#endif // defined(HAS_BIAS)
831
Giorgio Arena149fdf32018-07-04 17:03:33 +0100832#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000833#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100834 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100835#else /* defined(SRC_DEPTH) */
giuros013bfacb22019-04-01 12:07:02 +0100836 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100837#endif /* defined(SRC_DEPTH) */
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000838 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Giorgio Arena149fdf32018-07-04 17:03:33 +0100839
840 // Store the 1x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100841 VEC_DATA_TYPE(DATA_TYPE, 4)
842 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
843 *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
844 *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
845 *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
846 *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100847#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
848 // Store the 4x1 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100849 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenad02eb452018-07-18 11:45:30 +0100850 int mult_y = min(dst_size - offset, 1);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100851
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100852 VEC_DATA_TYPE(DATA_TYPE, 4)
853 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
854 *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;
855 *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;
856 *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;
857 *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100858#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100859 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000860#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100861 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000862#else /* defined(SRC_DEPTH) */
863 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
864#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100865 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100866 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100867
868 // Store the 4x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100869 VEC_DATA_TYPE(DATA_TYPE, 4)
870 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
871 VEC_DATA_TYPE(DATA_TYPE, 4)
872 out1_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)));
873 VEC_DATA_TYPE(DATA_TYPE, 4)
874 out2_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)));
875 VEC_DATA_TYPE(DATA_TYPE, 4)
876 out3_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)));
877 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
878 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
879 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;
880 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;
881 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;
882 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;
883 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;
884 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;
885 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;
886 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;
887 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;
888 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;
889 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;
890 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;
891 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;
892 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100893
894#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100895}
896
897#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
898 ({ \
899 comm_fact.s0 = d1 + d2; \
900 comm_fact.s1 = d3 + d4; \
901 comm_fact.s2 = d5 + d6; \
902 \
903 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
904 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
905 \
906 comm_fact.s0 = d1 - d2; \
907 comm_fact.s1 = d3 - d4; \
908 comm_fact.s2 = d5 - d6; \
909 \
910 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
911 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
912 })
913
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100914/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100915 *
916 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100917 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
918 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
919 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
920 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100921 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100922 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100923 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100924 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
925 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
926 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
927 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
928 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
929 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100930 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
931 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100932 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
933 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
934 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
935 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
936 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
937 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100938 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
939 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
940 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
941 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100942 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
943 */
944__kernel void winograd_output_transform_4x4_5x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100945 TENSOR4D_DECLARATION(src),
946 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100947#if defined(HAS_BIAS)
948 ,
949 VECTOR_DECLARATION(bias)
950#endif // defined(HAS_BIAS)
951)
952{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100953 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000954#if defined(SRC_DEPTH)
955 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100956 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000957#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100958
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000959 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
960 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
961#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100962
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100963 // Compute output address
964 int y_in = get_global_id(1);
965 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
966 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
967 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000968#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100969 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000970#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100971
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000972#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100973 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000974#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100975
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000976 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
977#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100978
979 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100980 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
981 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
982 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
983 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
984 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
985 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
986 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
987 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100988
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100989#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
990 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000991 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
992 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
993 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
994 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100995
996#if defined(HAS_BIAS)
997 // Add bias
998 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
999
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001000 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001001
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001002 out00 += (DATA_TYPE)b;
1003 out01 += (DATA_TYPE)b;
1004 out02 += (DATA_TYPE)b;
1005 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001006#endif // defined(HAS_BIAS)
1007
1008 // Store the output tile
1009#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001010 VEC_DATA_TYPE(DATA_TYPE, 4)
1011 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
1012 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
1013 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
1014 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
1015 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001016#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001017 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001018#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1019
1020#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001021
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001022 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1023 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1024 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1025 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1026 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1027 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1028 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1029 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001030
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001031 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1032 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1033 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1034 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1035 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1036 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1037 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1038 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001039
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001040 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1041 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1042 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1043 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1044 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1045 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1046 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1047 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001048
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001049 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1050 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1051 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1052 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1053 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1054 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1055 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1056 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001057
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001058 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1059 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1060 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1061 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1062 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1063 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1064 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1065 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001066
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001067 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1068 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1069 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1070 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1071 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1072 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1073 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1074 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001075
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001076 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1077 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1078 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1079 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1080 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1081 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1082 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1083 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001084
1085 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001086 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001087 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001088 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001089 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001090
1091 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1092 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1093 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1094 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1095 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1096 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1097 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1098 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1099
1100 // Compute the 4x4 output tile
1101 comm_fact0 = tmp_col1 + tmp_col2;
1102 comm_fact1 = tmp_col3 + tmp_col4;
1103 comm_fact2 = tmp_col5 + tmp_col6;
1104
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001105 VEC_DATA_TYPE(float, 4)
1106 out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
1107 VEC_DATA_TYPE(float, 4)
1108 out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001109
1110 comm_fact0 = tmp_col1 - tmp_col2;
1111 comm_fact1 = tmp_col3 - tmp_col4;
1112 comm_fact2 = tmp_col5 - tmp_col6;
1113
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001114 VEC_DATA_TYPE(float, 4)
1115 out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
1116 VEC_DATA_TYPE(float, 4)
1117 out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001118
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001119#if defined(HAS_BIAS)
1120 // Add bias
1121 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1122
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001123 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001124
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001125 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1126 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1127 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1128 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001129#endif // defined(HAS_BIAS)
1130
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001131 // Store the output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001132 vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0)), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
1133 vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1)), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
1134 vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2)), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
1135 vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3)), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001136#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001137}
1138
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001139/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001140 *
1141 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001142 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1143 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1144 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1145 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001146 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001147 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001148 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001149 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1150 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1151 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1152 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1153 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1154 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001155 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1156 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001157 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1158 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1159 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1160 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1161 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1162 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001163 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1164 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1165 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1166 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001167 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1168 */
1169__kernel void winograd_output_transform_4x4_5x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001170 TENSOR4D_DECLARATION(src),
1171 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001172#if defined(HAS_BIAS)
1173 VECTOR_DECLARATION(bias),
1174#endif // defined(HAS_BIAS)
1175 int dst_size)
1176{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001177 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001178#if defined(SRC_DEPTH)
1179 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001180 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001181#else /* defined(SRC_DEPTH) */
1182 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1183 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
1184#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001185
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001186 int y_in = get_global_id(1);
1187 int x_out = get_global_id(0);
1188 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
1189 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001190#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001191 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001192#endif /* defined(SRC_DEPTH) */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001193
1194 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001195 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
1196 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
1197 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
1198 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
1199 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
1200 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
1201 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
1202 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001203
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001204#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1205 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001206 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
1207 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
1208 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
1209 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001210
1211#if defined(HAS_BIAS)
1212 // Add bias
1213 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1214
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001215 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001216
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001217 out00 += (float)b;
1218 out01 += (float)b;
1219 out02 += (float)b;
1220 out03 += (float)b;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001221#endif // defined(HAS_BIAS)
1222
1223 // Store the output tile
1224#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1225 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001226#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001227 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001228#else /* defined(SRC_DEPTH) */
1229 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1230#endif /* defined(SRC_DEPTH) */
1231 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001232
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001233 VEC_DATA_TYPE(DATA_TYPE, 4)
1234 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
1235 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
1236 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
1237 *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
1238 *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001239#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1240 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001241 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001242 VEC_DATA_TYPE(DATA_TYPE, 4)
1243 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
1244 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
1245 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
1246 *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;
1247 *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001248#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1249
1250#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1251
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001252 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1253 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1254 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1255 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1256 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1257 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1258 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1259 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001260
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001261 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1262 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1263 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1264 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1265 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1266 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1267 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1268 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001269
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001270 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1271 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1272 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1273 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1274 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1275 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1276 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1277 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001278
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001279 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1280 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1281 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1282 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1283 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1284 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1285 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1286 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001287
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001288 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1289 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1290 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1291 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1292 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1293 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1294 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1295 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001296
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001297 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1298 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1299 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1300 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1301 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1302 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1303 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1304 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001305
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001306 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1307 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1308 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1309 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1310 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1311 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1312 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1313 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001314
1315 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001316 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001317 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001318 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001319 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001320
1321 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1322 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1323 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1324 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1325 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1326 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1327 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1328 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1329
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001330 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001331 comm_fact0 = tmp_col1 + tmp_col2;
1332 comm_fact1 = tmp_col3 + tmp_col4;
1333 comm_fact2 = tmp_col5 + tmp_col6;
1334
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001335 VEC_DATA_TYPE(float, 4)
1336 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
1337 VEC_DATA_TYPE(float, 4)
1338 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001339
1340 comm_fact0 = tmp_col1 - tmp_col2;
1341 comm_fact1 = tmp_col3 - tmp_col4;
1342 comm_fact2 = tmp_col5 - tmp_col6;
1343
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001344 VEC_DATA_TYPE(float, 4)
1345 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
1346 VEC_DATA_TYPE(float, 4)
1347 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001348
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001349#if defined(HAS_BIAS)
1350 // Add bias
1351 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1352
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001353 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001354
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001355 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1356 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1357 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1358 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001359#endif // defined(HAS_BIAS)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001360 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001361#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001362 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001363#else /* defined(SRC_DEPTH) */
1364 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1365#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +01001366 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
1367 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001368
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001369 // Store the output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001370 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1371 out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
1372 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1373 out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
1374 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1375 out_col2_dt = ACTIVATION_FUNC(CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
1376 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1377 out_col3_dt = ACTIVATION_FUNC(CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
1378
1379 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
1380 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
1381 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;
1382 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;
1383 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
1384 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
1385 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;
1386 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;
1387 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;
1388 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;
1389 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;
1390 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;
1391 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;
1392 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;
1393 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;
1394 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001395#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001396}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001397#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001398
1399#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001400#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001401/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1402 *
1403 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1404 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1405 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1406 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001407 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001408 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001409 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001410 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1411 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1412 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1413 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1414 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1415 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001416 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1417 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001418 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1419 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1420 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1421 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1422 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1423 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001424 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1425 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1426 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1427 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001428 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1429 */
1430__kernel void winograd_output_transform_2x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001431 TENSOR4D_DECLARATION(src),
1432 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001433#if defined(HAS_BIAS)
1434 ,
1435 VECTOR_DECLARATION(bias)
1436#endif // defined(HAS_BIAS)
1437)
1438{
1439 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1440 src_stride_x,
1441 src_step_x,
1442 src_stride_y,
1443 src_step_y,
1444 src_stride_z,
1445 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001446 src_stride_w,
1447 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001448 src_offset_first_element_in_bytes,
1449 dst_ptr,
1450 dst_stride_x,
1451 dst_step_x,
1452 dst_stride_y,
1453 dst_step_y,
1454 dst_stride_z,
1455 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001456 dst_stride_w,
1457 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001458 dst_offset_first_element_in_bytes
1459#if defined(HAS_BIAS)
1460 ,
1461 bias_ptr,
1462 bias_stride_x,
1463 bias_step_x,
1464 bias_offset_first_element_in_bytes
1465#endif // defined(HAS_BIAS)
1466 );
1467}
giuros013bfacb22019-04-01 12:07:02 +01001468
1469/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
1470 *
1471 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1472 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1473 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1474 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1475 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1476 *
1477 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1478 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1479 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1480 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1481 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1482 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1483 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1484 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1485 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1486 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1487 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1488 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1489 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1490 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1491 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1492 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1493 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1494 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1495 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1496 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1497 */
1498__kernel void winograd_output_transform_2x1_7x1_nhwc(
1499 TENSOR4D_DECLARATION(src),
1500 TENSOR4D_DECLARATION(dst),
1501#if defined(HAS_BIAS)
1502 VECTOR_DECLARATION(bias),
1503#endif // defined(HAS_BIAS)
1504 int dst_size)
1505{
1506 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1507 src_stride_x,
1508 src_step_x,
1509 src_stride_y,
1510 src_step_y,
1511 src_stride_z,
1512 src_step_z,
1513 src_stride_w,
1514 src_step_w,
1515 src_offset_first_element_in_bytes,
1516 dst_ptr,
1517 dst_stride_x,
1518 dst_step_x,
1519 dst_stride_y,
1520 dst_step_y,
1521 dst_stride_z,
1522 dst_step_z,
1523 dst_stride_w,
1524 dst_step_w,
1525 dst_offset_first_element_in_bytes,
1526#if defined(HAS_BIAS)
1527 bias_ptr,
1528 bias_stride_x,
1529 bias_step_x,
1530 bias_offset_first_element_in_bytes,
1531#endif // defined(HAS_BIAS)
1532 dst_size);
1533}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001534#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001535
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001536#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001537/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1538 *
1539 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1540 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1541 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1542 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001543 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001544 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001545 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001546 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1547 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1548 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1549 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1550 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1551 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001552 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1553 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001554 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1555 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1556 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1557 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1558 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1559 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001560 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1561 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1562 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1563 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001564 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1565 */
1566__kernel void winograd_output_transform_4x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001567 TENSOR4D_DECLARATION(src),
1568 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001569#if defined(HAS_BIAS)
1570 ,
1571 VECTOR_DECLARATION(bias)
1572#endif // defined(HAS_BIAS)
1573)
1574{
1575 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1576 src_stride_x,
1577 src_step_x,
1578 src_stride_y,
1579 src_step_y,
1580 src_stride_z,
1581 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001582 src_stride_w,
1583 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001584 src_offset_first_element_in_bytes,
1585 dst_ptr,
1586 dst_stride_x,
1587 dst_step_x,
1588 dst_stride_y,
1589 dst_step_y,
1590 dst_stride_z,
1591 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001592 dst_stride_w,
1593 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001594 dst_offset_first_element_in_bytes
1595#if defined(HAS_BIAS)
1596 ,
1597 bias_ptr,
1598 bias_stride_x,
1599 bias_step_x,
1600 bias_offset_first_element_in_bytes
1601#endif // defined(HAS_BIAS)
1602 );
1603}
1604
1605/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1606 *
1607 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1608 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1609 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1610 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001611 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001612 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001613 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001614 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1615 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1616 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1617 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1618 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1619 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001620 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1621 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001622 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1623 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1624 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1625 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1626 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1627 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001628 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1629 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1630 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1631 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001632 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1633 */
1634__kernel void winograd_output_transform_4x1_5x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001635 TENSOR4D_DECLARATION(src),
1636 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001637#if defined(HAS_BIAS)
1638 ,
1639 VECTOR_DECLARATION(bias)
1640#endif // defined(HAS_BIAS)
1641)
1642{
1643 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1644 src_stride_x,
1645 src_step_x,
1646 src_stride_y,
1647 src_step_y,
1648 src_stride_z,
1649 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001650 src_stride_w,
1651 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001652 src_offset_first_element_in_bytes,
1653 dst_ptr,
1654 dst_stride_x,
1655 dst_step_x,
1656 dst_stride_y,
1657 dst_step_y,
1658 dst_stride_z,
1659 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001660 dst_stride_w,
1661 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001662 dst_offset_first_element_in_bytes
1663#if defined(HAS_BIAS)
1664 ,
1665 bias_ptr,
1666 bias_stride_x,
1667 bias_step_x,
1668 bias_offset_first_element_in_bytes
1669#endif // defined(HAS_BIAS)
1670 );
1671}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001672
1673/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1674 *
1675 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1676 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1677 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1678 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001679 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001680 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001681 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001682 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1683 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1684 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1685 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1686 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1687 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001688 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1689 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001690 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1691 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1692 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1693 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1694 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1695 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001696 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1697 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1698 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1699 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001700 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1701 */
1702__kernel void winograd_output_transform_4x1_3x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001703 TENSOR4D_DECLARATION(src),
1704 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001705#if defined(HAS_BIAS)
1706 VECTOR_DECLARATION(bias),
1707#endif // defined(HAS_BIAS)
1708 int dst_size)
1709{
1710 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1711 src_stride_x,
1712 src_step_x,
1713 src_stride_y,
1714 src_step_y,
1715 src_stride_z,
1716 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001717 src_stride_w,
1718 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001719 src_offset_first_element_in_bytes,
1720 dst_ptr,
1721 dst_stride_x,
1722 dst_step_x,
1723 dst_stride_y,
1724 dst_step_y,
1725 dst_stride_z,
1726 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001727 dst_stride_w,
1728 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001729 dst_offset_first_element_in_bytes,
1730#if defined(HAS_BIAS)
1731 bias_ptr,
1732 bias_stride_x,
1733 bias_step_x,
1734 bias_offset_first_element_in_bytes,
1735#endif // defined(HAS_BIAS)
1736 dst_size);
1737}
1738
1739/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1740 *
1741 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1742 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1743 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1744 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001745 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001746 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001747 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001748 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1749 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1750 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1751 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1752 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1753 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001754 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1755 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001756 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1757 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1758 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1759 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1760 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1761 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001762 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1763 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1764 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1765 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001766 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1767 */
1768__kernel void winograd_output_transform_4x1_5x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001769 TENSOR4D_DECLARATION(src),
1770 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001771#if defined(HAS_BIAS)
1772 VECTOR_DECLARATION(bias),
1773#endif // defined(HAS_BIAS)
1774 int dst_size)
1775{
1776 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1777 src_stride_x,
1778 src_step_x,
1779 src_stride_y,
1780 src_step_y,
1781 src_stride_z,
1782 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001783 src_stride_w,
1784 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001785 src_offset_first_element_in_bytes,
1786 dst_ptr,
1787 dst_stride_x,
1788 dst_step_x,
1789 dst_stride_y,
1790 dst_step_y,
1791 dst_stride_z,
1792 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001793 dst_stride_w,
1794 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001795 dst_offset_first_element_in_bytes,
1796#if defined(HAS_BIAS)
1797 bias_ptr,
1798 bias_stride_x,
1799 bias_step_x,
1800 bias_offset_first_element_in_bytes,
1801#endif // defined(HAS_BIAS)
1802 dst_size);
1803}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001804#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001805#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1806
1807#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001808#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001809/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1810 *
1811 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1812 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1813 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1814 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001815 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001816 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001817 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001818 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1819 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1820 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1821 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1822 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1823 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001824 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1825 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001826 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1827 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1828 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1829 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1830 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1831 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001832 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1833 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1834 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1835 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001836 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1837 */
1838__kernel void winograd_output_transform_1x2_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001839 TENSOR4D_DECLARATION(src),
1840 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001841#if defined(HAS_BIAS)
1842 ,
1843 VECTOR_DECLARATION(bias)
1844#endif // defined(HAS_BIAS)
1845)
1846{
1847 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1848 src_stride_x,
1849 src_step_x,
1850 src_stride_y,
1851 src_step_y,
1852 src_stride_z,
1853 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001854 src_stride_w,
1855 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001856 src_offset_first_element_in_bytes,
1857 dst_ptr,
1858 dst_stride_x,
1859 dst_step_x,
1860 dst_stride_y,
1861 dst_step_y,
1862 dst_stride_z,
1863 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001864 dst_stride_w,
1865 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001866 dst_offset_first_element_in_bytes
1867#if defined(HAS_BIAS)
1868 ,
1869 bias_ptr,
1870 bias_stride_x,
1871 bias_step_x,
1872 bias_offset_first_element_in_bytes
1873#endif // defined(HAS_BIAS)
1874 );
1875}
giuros013bfacb22019-04-01 12:07:02 +01001876
1877/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
1878 *
1879 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1880 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1881 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1882 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1883 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1884 *
1885 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1886 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1887 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1888 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1889 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1890 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1891 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1892 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1893 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1894 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1895 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1896 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1897 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1898 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1899 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1900 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1901 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1902 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1903 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1904 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1905 */
1906__kernel void winograd_output_transform_1x2_1x7_nhwc(
1907 TENSOR4D_DECLARATION(src),
1908 TENSOR4D_DECLARATION(dst),
1909#if defined(HAS_BIAS)
1910 VECTOR_DECLARATION(bias),
1911#endif // defined(HAS_BIAS)
1912 int dst_size)
1913{
1914 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1915 src_stride_x,
1916 src_step_x,
1917 src_stride_y,
1918 src_step_y,
1919 src_stride_z,
1920 src_step_z,
1921 src_stride_w,
1922 src_step_w,
1923 src_offset_first_element_in_bytes,
1924 dst_ptr,
1925 dst_stride_x,
1926 dst_step_x,
1927 dst_stride_y,
1928 dst_step_y,
1929 dst_stride_z,
1930 dst_step_z,
1931 dst_stride_w,
1932 dst_step_w,
1933 dst_offset_first_element_in_bytes,
1934#if defined(HAS_BIAS)
1935 bias_ptr,
1936 bias_stride_x,
1937 bias_step_x,
1938 bias_offset_first_element_in_bytes,
1939#endif // defined(HAS_BIAS)
1940 dst_size);
1941}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001942#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001943
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001944#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001945/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1946 *
1947 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1948 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1949 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1950 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001951 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001952 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001953 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001954 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1955 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1956 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1957 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1958 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1959 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001960 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1961 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001962 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1963 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1964 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1965 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1966 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1967 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001968 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1969 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1970 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1971 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001972 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1973 */
1974__kernel void winograd_output_transform_1x4_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001975 TENSOR4D_DECLARATION(src),
1976 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001977#if defined(HAS_BIAS)
1978 ,
1979 VECTOR_DECLARATION(bias)
1980#endif // defined(HAS_BIAS)
1981)
1982{
1983 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1984 src_stride_x,
1985 src_step_x,
1986 src_stride_y,
1987 src_step_y,
1988 src_stride_z,
1989 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001990 src_stride_w,
1991 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001992 src_offset_first_element_in_bytes,
1993 dst_ptr,
1994 dst_stride_x,
1995 dst_step_x,
1996 dst_stride_y,
1997 dst_step_y,
1998 dst_stride_z,
1999 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002000 dst_stride_w,
2001 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002002 dst_offset_first_element_in_bytes
2003#if defined(HAS_BIAS)
2004 ,
2005 bias_ptr,
2006 bias_stride_x,
2007 bias_step_x,
2008 bias_offset_first_element_in_bytes
2009#endif // defined(HAS_BIAS)
2010 );
2011}
2012
2013/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
2014 *
2015 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2016 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2017 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2018 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002019 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002020 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002021 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002022 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2023 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2024 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2025 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2026 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2027 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002028 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2029 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002030 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2031 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2032 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2033 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2034 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2035 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002036 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2037 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2038 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2039 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002040 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2041 */
2042__kernel void winograd_output_transform_1x4_1x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002043 TENSOR4D_DECLARATION(src),
2044 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002045#if defined(HAS_BIAS)
2046 ,
2047 VECTOR_DECLARATION(bias)
2048#endif // defined(HAS_BIAS)
2049)
2050{
2051 winograd_output_transform_4x4_5x5_nchw(src_ptr,
2052 src_stride_x,
2053 src_step_x,
2054 src_stride_y,
2055 src_step_y,
2056 src_stride_z,
2057 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002058 src_stride_w,
2059 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002060 src_offset_first_element_in_bytes,
2061 dst_ptr,
2062 dst_stride_x,
2063 dst_step_x,
2064 dst_stride_y,
2065 dst_step_y,
2066 dst_stride_z,
2067 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002068 dst_stride_w,
2069 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002070 dst_offset_first_element_in_bytes
2071#if defined(HAS_BIAS)
2072 ,
2073 bias_ptr,
2074 bias_stride_x,
2075 bias_step_x,
2076 bias_offset_first_element_in_bytes
2077#endif // defined(HAS_BIAS)
2078 );
2079}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002080
2081/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
2082 *
2083 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2084 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2085 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2086 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002087 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002088 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002089 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002090 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2091 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2092 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2093 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2094 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2095 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002096 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2097 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002098 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2099 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2100 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2101 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2102 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2103 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002104 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2105 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2106 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2107 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002108 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2109 */
2110__kernel void winograd_output_transform_1x4_1x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002111 TENSOR4D_DECLARATION(src),
2112 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002113#if defined(HAS_BIAS)
2114 VECTOR_DECLARATION(bias),
2115#endif // defined(HAS_BIAS)
2116 int dst_size)
2117{
2118 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
2119 src_stride_x,
2120 src_step_x,
2121 src_stride_y,
2122 src_step_y,
2123 src_stride_z,
2124 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002125 src_stride_w,
2126 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002127 src_offset_first_element_in_bytes,
2128 dst_ptr,
2129 dst_stride_x,
2130 dst_step_x,
2131 dst_stride_y,
2132 dst_step_y,
2133 dst_stride_z,
2134 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002135 dst_stride_w,
2136 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002137 dst_offset_first_element_in_bytes,
2138#if defined(HAS_BIAS)
2139 bias_ptr,
2140 bias_stride_x,
2141 bias_step_x,
2142 bias_offset_first_element_in_bytes,
2143#endif // defined(HAS_BIAS)
2144 dst_size);
2145}
2146
2147/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
2148 *
2149 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2150 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2151 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2152 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002153 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002154 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002155 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002156 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2157 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2158 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2159 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2160 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2161 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002162 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2163 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002164 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2165 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2166 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2167 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2168 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2169 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002170 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2171 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2172 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2173 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002174 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2175 */
2176__kernel void winograd_output_transform_1x4_1x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002177 TENSOR4D_DECLARATION(src),
2178 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002179#if defined(HAS_BIAS)
2180 VECTOR_DECLARATION(bias),
2181#endif // defined(HAS_BIAS)
2182 int dst_size)
2183{
2184 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
2185 src_stride_x,
2186 src_step_x,
2187 src_stride_y,
2188 src_step_y,
2189 src_stride_z,
2190 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002191 src_stride_w,
2192 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002193 src_offset_first_element_in_bytes,
2194 dst_ptr,
2195 dst_stride_x,
2196 dst_step_x,
2197 dst_stride_y,
2198 dst_step_y,
2199 dst_stride_z,
2200 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002201 dst_stride_w,
2202 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002203 dst_offset_first_element_in_bytes,
2204#if defined(HAS_BIAS)
2205 bias_ptr,
2206 bias_stride_x,
2207 bias_step_x,
2208 bias_offset_first_element_in_bytes,
2209#endif // defined(HAS_BIAS)
2210 dst_size);
2211}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01002212#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002213#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002214#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)