blob: efd8502657a98dade3e4e34ba8ecfe101208a946 [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
Michalis Spyrouef6ec502020-07-31 11:38:36 +01002 * Copyright (c) 2018-2020 Arm Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Usama Arif6a98a6e2019-05-10 17:07:27 +010026#include "activation_float_helpers.h"
Manuel Bottini0d0028c2018-10-02 16:41:52 +010027
Georgios Pinitasffb57a02018-10-29 18:01:52 +000028#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010029#if defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010030/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
31 *
32 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
33 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
34 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
35 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
36 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010037 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Usama Arif6a98a6e2019-05-10 17:07:27 +010038 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottini0d0028c2018-10-02 16:41:52 +010039 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
40 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010041 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010042 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010043 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
44 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
45 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
46 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
47 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
48 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010049 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
50 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010051 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
52 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
53 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
54 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
55 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
56 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010057 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
58 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
59 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
60 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010061 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
62 */
63__kernel void winograd_output_transform_2x2_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +010064 TENSOR4D_DECLARATION(src),
65 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010066#if defined(HAS_BIAS)
67 ,
68 VECTOR_DECLARATION(bias)
69#endif // defined(HAS_BIAS)
70)
71{
72 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
Georgios Pinitasffb57a02018-10-29 18:01:52 +000073#if defined(SRC_DEPTH)
74 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +010075 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +000076#else /* defined(SRC_DEPTH) */
77 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
78 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
79#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010080
81 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010082 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
83 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
84 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
85 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010086
87#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
88 // Compute the 2x1 or 1x2 output tile
89 // out00 = d00 + d01 + d02
90 // out01 = d01 - d02 - d03
91
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +000092 float out00 = d00 + d01 + d02;
93 float out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010094#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010095
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010096 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
97 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
98 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
99 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100100
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100101 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
102 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
103 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
104 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100105
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100106 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
107 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
108 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
109 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100110
111 // Compute the 2x2 output tile
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000112 float k0 = d01 + d11 + d21;
113 float k1 = d02 + d12 + d22;
114 float k2 = d11 - d21 - d31;
115 float k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100116
117 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
118 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
119 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
120 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
121
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000122 float out00 = d10;
123 float out01 = -d13;
124 float out10 = d10;
125 float out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100126
127 out00 += d00 + d20 + k0 + k1;
128 out01 += k0 - k1 - (d03 + d23);
129 out10 += -d20 - d30 + k2 + k3;
130 out11 += k2 - k3 + d23 + d33;
131#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
132
133 int y_in = get_global_id(1);
134 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
135 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
136 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000137#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100138 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000139#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100140
141#if defined(HAS_BIAS)
142 // Add bias
143 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
144
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000145 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100146
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000147 out00 += (float)b;
148 out01 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100149#endif // defined(HAS_BIAS)
150
151 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000152#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100153 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000154#else /* defined(SRC_DEPTH) */
155 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
156#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100157
158 // Store the output tile
159#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Michalis Spyrouef6ec502020-07-31 11:38:36 +0100160 const VEC_DATA_TYPE(DATA_TYPE, 2)
161 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100162 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
163 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100164#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100165 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
166 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100167#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
168
169#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
170#if defined(HAS_BIAS)
171 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100172 out10 += (DATA_TYPE)b;
173 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100174#endif // defined(HAS_BIAS)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100175 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
176 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100177#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
178}
giuros013bfacb22019-04-01 12:07:02 +0100179
180#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
181 ({ \
182 col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \
183 col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \
184 })
185
186/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
187 *
188 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
189 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
190 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
191 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
192 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
193 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
194 *
195 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
196 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
197 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
198 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
199 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
200 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
201 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
202 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
203 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
204 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
205 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
206 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
207 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
208 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
209 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
210 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
211 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
212 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
213 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
214 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
215 */
216__kernel void winograd_output_transform_2x2_7x7_nhwc(
217 TENSOR4D_DECLARATION(src),
218 TENSOR4D_DECLARATION(dst),
219#if defined(HAS_BIAS)
220 VECTOR_DECLARATION(bias),
221#endif // defined(HAS_BIAS)
222 int dst_size)
223{
224 // Each thread stores a 4x4/4x1 or 1x4 tile
225#if defined(SRC_DEPTH)
226 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
227 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
228#else /* defined(SRC_DEPTH) */
229 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
230 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
231#endif /* defined(SRC_DEPTH) */
232
233 int y_in = get_global_id(1);
234 int x_out = get_global_id(0);
235 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
236 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
237#if defined(SRC_DEPTH)
238 int batch = get_global_id(2) / SRC_DEPTH;
239#endif /* defined(SRC_DEPTH) */
240
241#if defined(SRC_DEPTH)
242 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
243#else /* defined(SRC_DEPTH) */
244
245 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
246#endif /* defined(SRC_DEPTH) */
247
248 // Load the values across the channels to compose the input tile
249 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
250 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
251 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
252 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
253 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
254 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
255 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
256 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
257
258#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
259 // Compute out00, out01, out02 and out03
260 float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
261 float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
262
263#if defined(HAS_BIAS)
264 // Add bias
265 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
266
267 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
268
269 out00 += (float)b;
270 out01 += (float)b;
271#endif // defined(HAS_BIAS)
272
273 // Store the output tile
274#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
275 // Get output address
276#if defined(SRC_DEPTH)
277 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
278#else /* defined(SRC_DEPTH) */
279 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
280#endif /* defined(SRC_DEPTH) */
281 offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
282
283 VEC_DATA_TYPE(DATA_TYPE, 2)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100284 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100285 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
286 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
287#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
288 // Get output address
289 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
290 VEC_DATA_TYPE(DATA_TYPE, 2)
Michalis Spyrouef6ec502020-07-31 11:38:36 +0100291 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100292 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
293 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
294#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
295
296#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
297
298 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
299 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
300 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
301 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
302 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
303 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
304 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
305 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
306
307 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
308 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
309 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
310 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
311 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
312 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
313 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
314 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
315
316 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
317 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
318 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
319 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
320 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
321 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
322 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
323 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
324
325 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
326 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
327 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
328 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
329 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
330 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
331 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
332 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
333
334 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
335 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
336 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
337 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
338 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
339 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
340 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
341 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
342
343 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
344 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
345 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
346 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
347 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
348 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
349 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
350 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
351
352 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
353 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
354 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
355 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
356 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
357 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
358 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
359 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
360
361 // Compute the 8x2 intermediate tensor
362 VEC_DATA_TYPE(float, 2)
363 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
364
365 COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
366 COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
367 COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
368 COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
369 COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
370 COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
371 COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
372 COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
373
374 // Compute the 2x2 output tile
375 VEC_DATA_TYPE(float, 2)
376 out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
377 VEC_DATA_TYPE(float, 2)
378 out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
379
380#if defined(HAS_BIAS)
381 // Add bias
382 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
383
384 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
385
386 out_col0 += (VEC_DATA_TYPE(float, 2))b;
387 out_col1 += (VEC_DATA_TYPE(float, 2))b;
388
389#endif // defined(HAS_BIAS)
390 // Get output address
391#if defined(SRC_DEPTH)
392 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
393#else /* defined(SRC_DEPTH) */
394 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
395#endif /* defined(SRC_DEPTH) */
396 offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
397 int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
398
399 // Store the output tile
400 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100401 out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100402 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100403 out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100404
405 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
406 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
407
408 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
409 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
410
411#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
412}
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100413#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100414
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100415#if defined(VEC_SIZE) && VEC_SIZE == 4
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100416/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
417 *
418 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
419 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
420 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
421 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
422 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100423 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100424 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100425 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100426 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
427 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
428 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
429 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
430 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
431 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100432 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
433 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100434 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
435 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
436 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
437 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
438 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
439 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100440 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
441 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
442 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
443 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100444 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
445 */
446__kernel void winograd_output_transform_4x4_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100447 TENSOR4D_DECLARATION(src),
448 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100449#if defined(HAS_BIAS)
450 ,
451 VECTOR_DECLARATION(bias)
452#endif // defined(HAS_BIAS)
453)
454{
455 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000456#if defined(SRC_DEPTH)
457 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100458 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000459#else /* defined(SRC_DEPTH) */
giuros013bfacb22019-04-01 12:07:02 +0100460 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
461 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000462#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100463
464 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100465 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
466 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
467 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
468 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
469 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
470 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100471
472#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
473 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000474 float out00 = d00 + d01 + d02 + d03 + d04;
475 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
476 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
477 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100478#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100479
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100480 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
481 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
482 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
483 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
484 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
485 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100486
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100487 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
488 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
489 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
490 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
491 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
492 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100493
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100494 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
495 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
496 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
497 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
498 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
499 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100500
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100501 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
502 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
503 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
504 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
505 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
506 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100507
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100508 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
509 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
510 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
511 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
512 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
513 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100514
515 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000516 float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
517 float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
518 float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
519 float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100520
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000521 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
522 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100523
524 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
525 out01 += k1 - d02 - d12 - d22 - d32 - d42;
526 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
527 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
528
529 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000530 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
531 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
532 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
533 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100534
535 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
536 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
537
538 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
539 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
540 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
541 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
542
543 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000544 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
545 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
546 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
547 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100548
549 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
550 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
551
552 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
553 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
554 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
555 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
556
557 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000558 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
559 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
560 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
561 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100562
563 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
564 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
565
566 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
567 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
568 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
569 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
570#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
571
572 int y_in = get_global_id(1);
573 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
574 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
575 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000576#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100577 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000578#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100579
580#if defined(HAS_BIAS)
581 // Add bias
582 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
583
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000584 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100585
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000586 out00 += (float)b;
587 out01 += (float)b;
588 out02 += (float)b;
589 out03 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100590#endif // defined(HAS_BIAS)
591
592 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000593#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100594 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000595#else /* defined(SRC_DEPTH) */
596 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
597#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100598
599 // Store the output tile
600#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100601 VEC_DATA_TYPE(DATA_TYPE, 4)
Michalis Spyrouef6ec502020-07-31 11:38:36 +0100602 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
603 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100604 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
605 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
606 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
607 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100608#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100609 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
610 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100611#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
612
613#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
614#if defined(HAS_BIAS)
615 // Add bias
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000616 out10 += (float)b;
617 out11 += (float)b;
618 out12 += (float)b;
619 out13 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100620
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000621 out20 += (float)b;
622 out21 += (float)b;
623 out22 += (float)b;
624 out23 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100625
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000626 out30 += (float)b;
627 out31 += (float)b;
628 out32 += (float)b;
629 out33 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100630#endif // defined(HAS_BIAS)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100631 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
632 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
633 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
634 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
635 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
636 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100637#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
638}
639
Giorgio Arena149fdf32018-07-04 17:03:33 +0100640/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100641 *
642 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100643 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
644 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
645 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
646 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100647 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100648 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100649 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100650 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
651 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
652 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
653 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
654 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
655 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100656 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
657 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100658 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
659 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
660 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
661 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
662 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
663 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100664 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
665 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
666 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
667 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100668 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
669 * @param[in] dst_size Size of the destination tensor, minus the last padding
670 */
671__kernel void winograd_output_transform_4x4_3x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100672 TENSOR4D_DECLARATION(src),
673 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100674#if defined(HAS_BIAS)
675 VECTOR_DECLARATION(bias),
676#endif // defined(HAS_BIAS)
677 int dst_size)
678{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100679 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000680#if defined(SRC_DEPTH)
681 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100682 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000683#else /* defined(SRC_DEPTH) */
684 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
685 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
686#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100687
Giorgio Arena149fdf32018-07-04 17:03:33 +0100688 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100689 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
690 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
691 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
692 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
693 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
694 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100695
Giorgio Arena149fdf32018-07-04 17:03:33 +0100696#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
697 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000698 float out00 = d00 + d01 + d02 + d03 + d04;
699 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
700 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
701 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100702#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
703
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100704 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
705 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
706 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
707 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
708 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
709 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100710
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100711 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
712 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
713 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
714 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
715 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
716 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100717
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100718 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
719 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
720 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
721 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
722 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
723 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100724
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100725 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
726 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
727 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
728 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
729 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
730 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100731
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100732 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
733 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
734 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
735 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
736 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
737 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100738
739 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000740 float out00 = d01 + d21 + d41 + d11 + d31;
741 float out01 = d01 + d21 + d41 + d11 + d31;
742 float out02 = d01 + d21 + d41 + d11 + d31;
743 float out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100744
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000745 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
746 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100747
748 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
749 out01 += k1 - d02 - d12 - d22 - d32 - d42;
750 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
751 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
752
753 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000754 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
755 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
756 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
757 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100758
759 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
760 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
761
762 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
763 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
764 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
765 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
766
767 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000768 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
769 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
770 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
771 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100772
773 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
774 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
775
776 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
777 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
778 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
779 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
780
781 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000782 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
783 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
784 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
785 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100786
787 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
788 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
789
790 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
791 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
792 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
793 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100794#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100795
796 int y_in = get_global_id(1);
797 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100798 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
799 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000800#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100801 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000802#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100803
804#if defined(HAS_BIAS)
805 // Add bias
806 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
807
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100808 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100809
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100810 out00 += (DATA_TYPE)b;
811 out01 += (DATA_TYPE)b;
812 out02 += (DATA_TYPE)b;
813 out03 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100814#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100815 out10 += (DATA_TYPE)b;
816 out11 += (DATA_TYPE)b;
817 out12 += (DATA_TYPE)b;
818 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100819
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100820 out20 += (DATA_TYPE)b;
821 out21 += (DATA_TYPE)b;
822 out22 += (DATA_TYPE)b;
823 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100824
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100825 out30 += (DATA_TYPE)b;
826 out31 += (DATA_TYPE)b;
827 out32 += (DATA_TYPE)b;
828 out33 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100829#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100830
831#endif // defined(HAS_BIAS)
832
Giorgio Arena149fdf32018-07-04 17:03:33 +0100833#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000834#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100835 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100836#else /* defined(SRC_DEPTH) */
giuros013bfacb22019-04-01 12:07:02 +0100837 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100838#endif /* defined(SRC_DEPTH) */
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000839 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Giorgio Arena149fdf32018-07-04 17:03:33 +0100840
841 // Store the 1x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100842 VEC_DATA_TYPE(DATA_TYPE, 4)
Michalis Spyrouef6ec502020-07-31 11:38:36 +0100843 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100844 *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
845 *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
846 *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
847 *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100848#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
849 // Store the 4x1 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100850 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenad02eb452018-07-18 11:45:30 +0100851 int mult_y = min(dst_size - offset, 1);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100852
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100853 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100854 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)),
855 A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100856 *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;
857 *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;
858 *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;
859 *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100860#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100861 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000862#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100863 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000864#else /* defined(SRC_DEPTH) */
865 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
866#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100867 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100868 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100869
870 // Store the 4x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100871 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100872 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100873 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100874 out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100875 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100876 out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100877 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100878 out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33),
Michalis Spyrouef6ec502020-07-31 11:38:36 +0100879 VEC_DATA_TYPE(DATA_TYPE, 4)),
Usama Arif6a98a6e2019-05-10 17:07:27 +0100880 A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100881 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
882 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
883 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;
884 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;
885 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;
886 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;
887 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;
888 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;
889 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;
890 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;
891 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;
892 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;
893 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;
894 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;
895 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;
896 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100897
898#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100899}
900
901#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
902 ({ \
903 comm_fact.s0 = d1 + d2; \
904 comm_fact.s1 = d3 + d4; \
905 comm_fact.s2 = d5 + d6; \
906 \
907 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
908 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
909 \
910 comm_fact.s0 = d1 - d2; \
911 comm_fact.s1 = d3 - d4; \
912 comm_fact.s2 = d5 - d6; \
913 \
914 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
915 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
916 })
917
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100918/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100919 *
920 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100921 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
922 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
923 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
924 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100925 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100926 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100927 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100928 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
929 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
930 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
931 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
932 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
933 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100934 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
935 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100936 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
937 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
938 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
939 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
940 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
941 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100942 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
943 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
944 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
945 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100946 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
947 */
948__kernel void winograd_output_transform_4x4_5x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100949 TENSOR4D_DECLARATION(src),
950 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100951#if defined(HAS_BIAS)
952 ,
953 VECTOR_DECLARATION(bias)
954#endif // defined(HAS_BIAS)
955)
956{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100957 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000958#if defined(SRC_DEPTH)
959 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100960 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000961#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100962
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000963 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
964 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
965#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100966
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100967 // Compute output address
968 int y_in = get_global_id(1);
969 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
970 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
971 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000972#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100973 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000974#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100975
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000976#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100977 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000978#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100979
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000980 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
981#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100982
983 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100984 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
985 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
986 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
987 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
988 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
989 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
990 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
991 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100992
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100993#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
994 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000995 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
996 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
997 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
998 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100999
1000#if defined(HAS_BIAS)
1001 // Add bias
1002 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1003
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001004 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001005
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001006 out00 += (DATA_TYPE)b;
1007 out01 += (DATA_TYPE)b;
1008 out02 += (DATA_TYPE)b;
1009 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001010#endif // defined(HAS_BIAS)
1011
1012 // Store the output tile
1013#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001014 VEC_DATA_TYPE(DATA_TYPE, 4)
Michalis Spyrouef6ec502020-07-31 11:38:36 +01001015 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
1016 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001017 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
1018 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
1019 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
1020 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001021#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001022 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
1023 (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001024#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1025
1026#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001027
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001028 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1029 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1030 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1031 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1032 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1033 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1034 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1035 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001036
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001037 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1038 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1039 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1040 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1041 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1042 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1043 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1044 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001045
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001046 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1047 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1048 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1049 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1050 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1051 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1052 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1053 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001054
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001055 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1056 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1057 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1058 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1059 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1060 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1061 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1062 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001063
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001064 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1065 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1066 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1067 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1068 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1069 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1070 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1071 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001072
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001073 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1074 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1075 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1076 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1077 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1078 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1079 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1080 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001081
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001082 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1083 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1084 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1085 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1086 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1087 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1088 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1089 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001090
1091 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001092 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001093 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001094 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001095 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001096
1097 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1098 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1099 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1100 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1101 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1102 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1103 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1104 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1105
1106 // Compute the 4x4 output tile
1107 comm_fact0 = tmp_col1 + tmp_col2;
1108 comm_fact1 = tmp_col3 + tmp_col4;
1109 comm_fact2 = tmp_col5 + tmp_col6;
1110
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001111 VEC_DATA_TYPE(float, 4)
1112 out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
1113 VEC_DATA_TYPE(float, 4)
1114 out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001115
1116 comm_fact0 = tmp_col1 - tmp_col2;
1117 comm_fact1 = tmp_col3 - tmp_col4;
1118 comm_fact2 = tmp_col5 - tmp_col6;
1119
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001120 VEC_DATA_TYPE(float, 4)
1121 out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
1122 VEC_DATA_TYPE(float, 4)
1123 out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001124
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001125#if defined(HAS_BIAS)
1126 // Add bias
1127 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1128
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001129 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001130
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001131 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1132 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1133 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1134 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001135#endif // defined(HAS_BIAS)
1136
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001137 // Store the output tile
Usama Arif6a98a6e2019-05-10 17:07:27 +01001138 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,
1139 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
1140 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,
1141 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
1142 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,
1143 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
1144 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,
1145 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001146#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001147}
1148
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001149/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001150 *
1151 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001152 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1153 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1154 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1155 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001156 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001157 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001158 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001159 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1160 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1161 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1162 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1163 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1164 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001165 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1166 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001167 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1168 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1169 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1170 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1171 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1172 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001173 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1174 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1175 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1176 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001177 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1178 */
1179__kernel void winograd_output_transform_4x4_5x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001180 TENSOR4D_DECLARATION(src),
1181 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001182#if defined(HAS_BIAS)
1183 VECTOR_DECLARATION(bias),
1184#endif // defined(HAS_BIAS)
1185 int dst_size)
1186{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001187 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001188#if defined(SRC_DEPTH)
1189 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001190 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001191#else /* defined(SRC_DEPTH) */
1192 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1193 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
1194#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001195
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001196 int y_in = get_global_id(1);
1197 int x_out = get_global_id(0);
1198 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
1199 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001200#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001201 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001202#endif /* defined(SRC_DEPTH) */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001203
1204 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001205 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
1206 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
1207 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
1208 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
1209 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
1210 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
1211 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
1212 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001213
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001214#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1215 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001216 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
1217 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
1218 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
1219 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001220
1221#if defined(HAS_BIAS)
1222 // Add bias
1223 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1224
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001225 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001226
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001227 out00 += (float)b;
1228 out01 += (float)b;
1229 out02 += (float)b;
1230 out03 += (float)b;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001231#endif // defined(HAS_BIAS)
1232
1233 // Store the output tile
1234#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1235 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001236#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001237 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001238#else /* defined(SRC_DEPTH) */
1239 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1240#endif /* defined(SRC_DEPTH) */
1241 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001242
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001243 VEC_DATA_TYPE(DATA_TYPE, 4)
Michalis Spyrouef6ec502020-07-31 11:38:36 +01001244 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001245 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
1246 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
1247 *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
1248 *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001249#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1250 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001251 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001252 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001253 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
1254 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001255 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
1256 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
1257 *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;
1258 *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001259#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1260
1261#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1262
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001263 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1264 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1265 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1266 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1267 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1268 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1269 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1270 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001271
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001272 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1273 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1274 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1275 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1276 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1277 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1278 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1279 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001280
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001281 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1282 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1283 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1284 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1285 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1286 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1287 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1288 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001289
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001290 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1291 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1292 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1293 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1294 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1295 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1296 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1297 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001298
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001299 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1300 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1301 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1302 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1303 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1304 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1305 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1306 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001307
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001308 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1309 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1310 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1311 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1312 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1313 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1314 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1315 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001316
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001317 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1318 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1319 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1320 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1321 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1322 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1323 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1324 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001325
1326 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001327 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001328 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001329 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001330 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001331
1332 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1333 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1334 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1335 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1336 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1337 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1338 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1339 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1340
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001341 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001342 comm_fact0 = tmp_col1 + tmp_col2;
1343 comm_fact1 = tmp_col3 + tmp_col4;
1344 comm_fact2 = tmp_col5 + tmp_col6;
1345
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001346 VEC_DATA_TYPE(float, 4)
1347 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
1348 VEC_DATA_TYPE(float, 4)
1349 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001350
1351 comm_fact0 = tmp_col1 - tmp_col2;
1352 comm_fact1 = tmp_col3 - tmp_col4;
1353 comm_fact2 = tmp_col5 - tmp_col6;
1354
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001355 VEC_DATA_TYPE(float, 4)
1356 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
1357 VEC_DATA_TYPE(float, 4)
1358 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001359
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001360#if defined(HAS_BIAS)
1361 // Add bias
1362 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1363
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001364 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001365
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001366 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1367 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1368 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1369 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001370#endif // defined(HAS_BIAS)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001371 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001372#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001373 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001374#else /* defined(SRC_DEPTH) */
1375 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1376#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +01001377 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
1378 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001379
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001380 // Store the output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001381 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001382 out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001383 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001384 out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001385 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001386 out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001387 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001388 out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001389
1390 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
1391 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
1392 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;
1393 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;
1394 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
1395 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
1396 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;
1397 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;
1398 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;
1399 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;
1400 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;
1401 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;
1402 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;
1403 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;
1404 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;
1405 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001406#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001407}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001408#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001409
1410#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001411#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001412/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1413 *
1414 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1415 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1416 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1417 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001418 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001419 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001420 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001421 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1422 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1423 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1424 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1425 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1426 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001427 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1428 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001429 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1430 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1431 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1432 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1433 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1434 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001435 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1436 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1437 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1438 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001439 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1440 */
1441__kernel void winograd_output_transform_2x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001442 TENSOR4D_DECLARATION(src),
1443 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001444#if defined(HAS_BIAS)
1445 ,
1446 VECTOR_DECLARATION(bias)
1447#endif // defined(HAS_BIAS)
1448)
1449{
1450 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1451 src_stride_x,
1452 src_step_x,
1453 src_stride_y,
1454 src_step_y,
1455 src_stride_z,
1456 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001457 src_stride_w,
1458 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001459 src_offset_first_element_in_bytes,
1460 dst_ptr,
1461 dst_stride_x,
1462 dst_step_x,
1463 dst_stride_y,
1464 dst_step_y,
1465 dst_stride_z,
1466 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001467 dst_stride_w,
1468 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001469 dst_offset_first_element_in_bytes
1470#if defined(HAS_BIAS)
1471 ,
1472 bias_ptr,
1473 bias_stride_x,
1474 bias_step_x,
1475 bias_offset_first_element_in_bytes
1476#endif // defined(HAS_BIAS)
1477 );
1478}
giuros013bfacb22019-04-01 12:07:02 +01001479
1480/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
1481 *
1482 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1483 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1484 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1485 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1486 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1487 *
1488 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1489 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1490 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1491 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1492 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1493 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1494 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1495 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1496 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1497 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1498 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1499 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1500 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1501 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1502 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1503 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1504 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1505 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1506 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1507 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1508 */
1509__kernel void winograd_output_transform_2x1_7x1_nhwc(
1510 TENSOR4D_DECLARATION(src),
1511 TENSOR4D_DECLARATION(dst),
1512#if defined(HAS_BIAS)
1513 VECTOR_DECLARATION(bias),
1514#endif // defined(HAS_BIAS)
1515 int dst_size)
1516{
1517 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1518 src_stride_x,
1519 src_step_x,
1520 src_stride_y,
1521 src_step_y,
1522 src_stride_z,
1523 src_step_z,
1524 src_stride_w,
1525 src_step_w,
1526 src_offset_first_element_in_bytes,
1527 dst_ptr,
1528 dst_stride_x,
1529 dst_step_x,
1530 dst_stride_y,
1531 dst_step_y,
1532 dst_stride_z,
1533 dst_step_z,
1534 dst_stride_w,
1535 dst_step_w,
1536 dst_offset_first_element_in_bytes,
1537#if defined(HAS_BIAS)
1538 bias_ptr,
1539 bias_stride_x,
1540 bias_step_x,
1541 bias_offset_first_element_in_bytes,
1542#endif // defined(HAS_BIAS)
1543 dst_size);
1544}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001545#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001546
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001547#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001548/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1549 *
1550 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1551 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1552 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1553 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001554 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001555 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001556 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001557 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1558 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1559 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1560 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1561 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1562 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001563 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1564 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001565 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1566 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1567 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1568 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1569 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1570 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001571 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1572 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1573 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1574 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001575 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1576 */
1577__kernel void winograd_output_transform_4x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001578 TENSOR4D_DECLARATION(src),
1579 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001580#if defined(HAS_BIAS)
1581 ,
1582 VECTOR_DECLARATION(bias)
1583#endif // defined(HAS_BIAS)
1584)
1585{
1586 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1587 src_stride_x,
1588 src_step_x,
1589 src_stride_y,
1590 src_step_y,
1591 src_stride_z,
1592 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001593 src_stride_w,
1594 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001595 src_offset_first_element_in_bytes,
1596 dst_ptr,
1597 dst_stride_x,
1598 dst_step_x,
1599 dst_stride_y,
1600 dst_step_y,
1601 dst_stride_z,
1602 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001603 dst_stride_w,
1604 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001605 dst_offset_first_element_in_bytes
1606#if defined(HAS_BIAS)
1607 ,
1608 bias_ptr,
1609 bias_stride_x,
1610 bias_step_x,
1611 bias_offset_first_element_in_bytes
1612#endif // defined(HAS_BIAS)
1613 );
1614}
1615
1616/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1617 *
1618 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1619 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1620 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1621 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001622 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001623 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001624 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001625 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1626 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1627 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1628 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1629 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1630 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001631 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1632 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001633 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1634 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1635 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1636 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1637 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1638 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001639 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1640 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1641 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1642 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001643 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1644 */
1645__kernel void winograd_output_transform_4x1_5x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001646 TENSOR4D_DECLARATION(src),
1647 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001648#if defined(HAS_BIAS)
1649 ,
1650 VECTOR_DECLARATION(bias)
1651#endif // defined(HAS_BIAS)
1652)
1653{
1654 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1655 src_stride_x,
1656 src_step_x,
1657 src_stride_y,
1658 src_step_y,
1659 src_stride_z,
1660 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001661 src_stride_w,
1662 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001663 src_offset_first_element_in_bytes,
1664 dst_ptr,
1665 dst_stride_x,
1666 dst_step_x,
1667 dst_stride_y,
1668 dst_step_y,
1669 dst_stride_z,
1670 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001671 dst_stride_w,
1672 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001673 dst_offset_first_element_in_bytes
1674#if defined(HAS_BIAS)
1675 ,
1676 bias_ptr,
1677 bias_stride_x,
1678 bias_step_x,
1679 bias_offset_first_element_in_bytes
1680#endif // defined(HAS_BIAS)
1681 );
1682}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001683
1684/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1685 *
1686 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1687 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1688 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1689 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001690 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001691 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001692 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001693 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1694 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1695 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1696 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1697 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1698 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001699 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1700 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001701 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1702 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1703 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1704 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1705 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1706 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001707 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1708 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1709 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1710 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001711 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1712 */
1713__kernel void winograd_output_transform_4x1_3x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001714 TENSOR4D_DECLARATION(src),
1715 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001716#if defined(HAS_BIAS)
1717 VECTOR_DECLARATION(bias),
1718#endif // defined(HAS_BIAS)
1719 int dst_size)
1720{
1721 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1722 src_stride_x,
1723 src_step_x,
1724 src_stride_y,
1725 src_step_y,
1726 src_stride_z,
1727 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001728 src_stride_w,
1729 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001730 src_offset_first_element_in_bytes,
1731 dst_ptr,
1732 dst_stride_x,
1733 dst_step_x,
1734 dst_stride_y,
1735 dst_step_y,
1736 dst_stride_z,
1737 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001738 dst_stride_w,
1739 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001740 dst_offset_first_element_in_bytes,
1741#if defined(HAS_BIAS)
1742 bias_ptr,
1743 bias_stride_x,
1744 bias_step_x,
1745 bias_offset_first_element_in_bytes,
1746#endif // defined(HAS_BIAS)
1747 dst_size);
1748}
1749
1750/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1751 *
1752 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1753 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1754 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1755 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001756 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001757 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001758 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001759 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1760 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1761 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1762 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1763 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1764 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001765 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1766 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001767 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1768 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1769 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1770 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1771 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1772 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001773 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1774 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1775 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1776 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001777 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1778 */
1779__kernel void winograd_output_transform_4x1_5x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001780 TENSOR4D_DECLARATION(src),
1781 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001782#if defined(HAS_BIAS)
1783 VECTOR_DECLARATION(bias),
1784#endif // defined(HAS_BIAS)
1785 int dst_size)
1786{
1787 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1788 src_stride_x,
1789 src_step_x,
1790 src_stride_y,
1791 src_step_y,
1792 src_stride_z,
1793 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001794 src_stride_w,
1795 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001796 src_offset_first_element_in_bytes,
1797 dst_ptr,
1798 dst_stride_x,
1799 dst_step_x,
1800 dst_stride_y,
1801 dst_step_y,
1802 dst_stride_z,
1803 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001804 dst_stride_w,
1805 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001806 dst_offset_first_element_in_bytes,
1807#if defined(HAS_BIAS)
1808 bias_ptr,
1809 bias_stride_x,
1810 bias_step_x,
1811 bias_offset_first_element_in_bytes,
1812#endif // defined(HAS_BIAS)
1813 dst_size);
1814}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001815#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001816#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1817
1818#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001819#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001820/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1821 *
1822 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1823 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1824 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1825 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001826 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001827 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001828 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001829 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1830 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1831 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1832 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1833 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1834 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001835 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1836 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001837 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1838 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1839 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1840 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1841 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1842 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001843 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1844 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1845 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1846 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001847 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1848 */
1849__kernel void winograd_output_transform_1x2_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001850 TENSOR4D_DECLARATION(src),
1851 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001852#if defined(HAS_BIAS)
1853 ,
1854 VECTOR_DECLARATION(bias)
1855#endif // defined(HAS_BIAS)
1856)
1857{
1858 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1859 src_stride_x,
1860 src_step_x,
1861 src_stride_y,
1862 src_step_y,
1863 src_stride_z,
1864 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001865 src_stride_w,
1866 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001867 src_offset_first_element_in_bytes,
1868 dst_ptr,
1869 dst_stride_x,
1870 dst_step_x,
1871 dst_stride_y,
1872 dst_step_y,
1873 dst_stride_z,
1874 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001875 dst_stride_w,
1876 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001877 dst_offset_first_element_in_bytes
1878#if defined(HAS_BIAS)
1879 ,
1880 bias_ptr,
1881 bias_stride_x,
1882 bias_step_x,
1883 bias_offset_first_element_in_bytes
1884#endif // defined(HAS_BIAS)
1885 );
1886}
giuros013bfacb22019-04-01 12:07:02 +01001887
1888/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
1889 *
1890 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1891 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1892 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1893 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1894 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1895 *
1896 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1897 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1898 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1899 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1900 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1901 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1902 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1903 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1904 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1905 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1906 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1907 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1908 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1909 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1910 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1911 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1912 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1913 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1914 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1915 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1916 */
1917__kernel void winograd_output_transform_1x2_1x7_nhwc(
1918 TENSOR4D_DECLARATION(src),
1919 TENSOR4D_DECLARATION(dst),
1920#if defined(HAS_BIAS)
1921 VECTOR_DECLARATION(bias),
1922#endif // defined(HAS_BIAS)
1923 int dst_size)
1924{
1925 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1926 src_stride_x,
1927 src_step_x,
1928 src_stride_y,
1929 src_step_y,
1930 src_stride_z,
1931 src_step_z,
1932 src_stride_w,
1933 src_step_w,
1934 src_offset_first_element_in_bytes,
1935 dst_ptr,
1936 dst_stride_x,
1937 dst_step_x,
1938 dst_stride_y,
1939 dst_step_y,
1940 dst_stride_z,
1941 dst_step_z,
1942 dst_stride_w,
1943 dst_step_w,
1944 dst_offset_first_element_in_bytes,
1945#if defined(HAS_BIAS)
1946 bias_ptr,
1947 bias_stride_x,
1948 bias_step_x,
1949 bias_offset_first_element_in_bytes,
1950#endif // defined(HAS_BIAS)
1951 dst_size);
1952}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001953#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001954
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001955#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001956/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1957 *
1958 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1959 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1960 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1961 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001962 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001963 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001964 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001965 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1966 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1967 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1968 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1969 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1970 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001971 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1972 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001973 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1974 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1975 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1976 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1977 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1978 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001979 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1980 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1981 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1982 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001983 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1984 */
1985__kernel void winograd_output_transform_1x4_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001986 TENSOR4D_DECLARATION(src),
1987 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001988#if defined(HAS_BIAS)
1989 ,
1990 VECTOR_DECLARATION(bias)
1991#endif // defined(HAS_BIAS)
1992)
1993{
1994 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1995 src_stride_x,
1996 src_step_x,
1997 src_stride_y,
1998 src_step_y,
1999 src_stride_z,
2000 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002001 src_stride_w,
2002 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002003 src_offset_first_element_in_bytes,
2004 dst_ptr,
2005 dst_stride_x,
2006 dst_step_x,
2007 dst_stride_y,
2008 dst_step_y,
2009 dst_stride_z,
2010 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002011 dst_stride_w,
2012 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002013 dst_offset_first_element_in_bytes
2014#if defined(HAS_BIAS)
2015 ,
2016 bias_ptr,
2017 bias_stride_x,
2018 bias_step_x,
2019 bias_offset_first_element_in_bytes
2020#endif // defined(HAS_BIAS)
2021 );
2022}
2023
2024/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
2025 *
2026 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2027 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2028 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2029 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002030 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002031 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002032 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002033 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2034 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2035 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2036 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2037 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2038 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002039 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2040 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002041 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2042 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2043 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2044 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2045 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2046 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002047 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2048 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2049 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2050 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002051 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2052 */
2053__kernel void winograd_output_transform_1x4_1x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002054 TENSOR4D_DECLARATION(src),
2055 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002056#if defined(HAS_BIAS)
2057 ,
2058 VECTOR_DECLARATION(bias)
2059#endif // defined(HAS_BIAS)
2060)
2061{
2062 winograd_output_transform_4x4_5x5_nchw(src_ptr,
2063 src_stride_x,
2064 src_step_x,
2065 src_stride_y,
2066 src_step_y,
2067 src_stride_z,
2068 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002069 src_stride_w,
2070 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002071 src_offset_first_element_in_bytes,
2072 dst_ptr,
2073 dst_stride_x,
2074 dst_step_x,
2075 dst_stride_y,
2076 dst_step_y,
2077 dst_stride_z,
2078 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002079 dst_stride_w,
2080 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002081 dst_offset_first_element_in_bytes
2082#if defined(HAS_BIAS)
2083 ,
2084 bias_ptr,
2085 bias_stride_x,
2086 bias_step_x,
2087 bias_offset_first_element_in_bytes
2088#endif // defined(HAS_BIAS)
2089 );
2090}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002091
2092/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
2093 *
2094 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2095 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2096 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2097 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002098 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002099 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002100 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002101 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2102 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2103 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2104 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2105 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2106 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002107 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2108 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002109 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2110 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2111 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2112 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2113 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2114 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002115 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2116 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2117 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2118 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002119 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2120 */
2121__kernel void winograd_output_transform_1x4_1x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002122 TENSOR4D_DECLARATION(src),
2123 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002124#if defined(HAS_BIAS)
2125 VECTOR_DECLARATION(bias),
2126#endif // defined(HAS_BIAS)
2127 int dst_size)
2128{
2129 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
2130 src_stride_x,
2131 src_step_x,
2132 src_stride_y,
2133 src_step_y,
2134 src_stride_z,
2135 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002136 src_stride_w,
2137 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002138 src_offset_first_element_in_bytes,
2139 dst_ptr,
2140 dst_stride_x,
2141 dst_step_x,
2142 dst_stride_y,
2143 dst_step_y,
2144 dst_stride_z,
2145 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002146 dst_stride_w,
2147 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002148 dst_offset_first_element_in_bytes,
2149#if defined(HAS_BIAS)
2150 bias_ptr,
2151 bias_stride_x,
2152 bias_step_x,
2153 bias_offset_first_element_in_bytes,
2154#endif // defined(HAS_BIAS)
2155 dst_size);
2156}
2157
2158/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
2159 *
2160 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2161 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2162 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2163 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002164 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002165 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002166 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002167 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2168 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2169 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2170 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2171 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2172 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002173 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2174 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002175 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2176 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2177 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2178 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2179 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2180 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002181 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2182 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2183 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2184 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002185 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2186 */
2187__kernel void winograd_output_transform_1x4_1x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002188 TENSOR4D_DECLARATION(src),
2189 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002190#if defined(HAS_BIAS)
2191 VECTOR_DECLARATION(bias),
2192#endif // defined(HAS_BIAS)
2193 int dst_size)
2194{
2195 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
2196 src_stride_x,
2197 src_step_x,
2198 src_stride_y,
2199 src_step_y,
2200 src_stride_z,
2201 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002202 src_stride_w,
2203 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002204 src_offset_first_element_in_bytes,
2205 dst_ptr,
2206 dst_stride_x,
2207 dst_step_x,
2208 dst_stride_y,
2209 dst_step_y,
2210 dst_stride_z,
2211 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002212 dst_stride_w,
2213 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002214 dst_offset_first_element_in_bytes,
2215#if defined(HAS_BIAS)
2216 bias_ptr,
2217 bias_stride_x,
2218 bias_step_x,
2219 bias_offset_first_element_in_bytes,
2220#endif // defined(HAS_BIAS)
2221 dst_size);
2222}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01002223#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002224#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002225#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)