blob: e735bbafb638ae58e9ea756690f8ad32f06a770b [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
Michalis Spyrouef6ec502020-07-31 11:38:36 +01002 * Copyright (c) 2018-2020 Arm Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Usama Arif6a98a6e2019-05-10 17:07:27 +010026#include "activation_float_helpers.h"
Manuel Bottini0d0028c2018-10-02 16:41:52 +010027
Georgios Pinitasffb57a02018-10-29 18:01:52 +000028#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010029#if defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010030/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
31 *
32 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
33 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
34 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
35 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
36 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010037 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Usama Arif6a98a6e2019-05-10 17:07:27 +010038 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottini0d0028c2018-10-02 16:41:52 +010039 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
40 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010041 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010042 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010043 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
44 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
45 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
46 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
47 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
48 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010049 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
50 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010051 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
52 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
53 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
54 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
55 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
56 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010057 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
58 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
59 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
60 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010061 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
62 */
63__kernel void winograd_output_transform_2x2_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +010064 TENSOR4D_DECLARATION(src),
65 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010066#if defined(HAS_BIAS)
67 ,
68 VECTOR_DECLARATION(bias)
69#endif // defined(HAS_BIAS)
70)
71{
72 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
Georgios Pinitasffb57a02018-10-29 18:01:52 +000073#if defined(SRC_DEPTH)
74 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +010075 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +000076#else /* defined(SRC_DEPTH) */
77 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
78 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
79#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010080
81 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010082 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
83 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
84 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
85 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010086
87#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
88 // Compute the 2x1 or 1x2 output tile
89 // out00 = d00 + d01 + d02
90 // out01 = d01 - d02 - d03
91
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +000092 float out00 = d00 + d01 + d02;
93 float out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010094#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010095
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010096 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
97 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
98 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
99 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100100
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100101 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
102 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
103 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
104 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100105
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100106 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
107 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
108 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
109 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100110
111 // Compute the 2x2 output tile
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000112 float k0 = d01 + d11 + d21;
113 float k1 = d02 + d12 + d22;
114 float k2 = d11 - d21 - d31;
115 float k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100116
117 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
118 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
119 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
120 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
121
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000122 float out00 = d10;
123 float out01 = -d13;
124 float out10 = d10;
125 float out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100126
127 out00 += d00 + d20 + k0 + k1;
128 out01 += k0 - k1 - (d03 + d23);
129 out10 += -d20 - d30 + k2 + k3;
130 out11 += k2 - k3 + d23 + d33;
131#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
132
133 int y_in = get_global_id(1);
134 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
135 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
136 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000137#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100138 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000139#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100140
141#if defined(HAS_BIAS)
142 // Add bias
143 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
144
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000145 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100146
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000147 out00 += (float)b;
148 out01 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100149#endif // defined(HAS_BIAS)
150
151 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000152#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100153 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000154#else /* defined(SRC_DEPTH) */
155 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
156#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100157
158 // Store the output tile
159#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Michalis Spyrouef6ec502020-07-31 11:38:36 +0100160 const VEC_DATA_TYPE(DATA_TYPE, 2)
Giorgio Arenad056e572020-10-12 11:53:51 +0100161 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100162 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
163 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100164#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +0100165 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100166 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100167#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
168
169#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
170#if defined(HAS_BIAS)
171 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100172 out10 += (DATA_TYPE)b;
173 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100174#endif // defined(HAS_BIAS)
Giorgio Arenad056e572020-10-12 11:53:51 +0100175 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100176 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100177#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
178}
giuros013bfacb22019-04-01 12:07:02 +0100179
180#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
181 ({ \
182 col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \
183 col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \
184 })
185
186/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
187 *
188 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
189 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
190 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
191 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
192 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
193 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
194 *
195 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
196 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
197 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
198 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
199 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
200 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
201 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
202 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
203 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
204 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
205 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
206 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
207 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
208 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
209 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
210 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
211 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
212 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
213 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
214 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
215 */
216__kernel void winograd_output_transform_2x2_7x7_nhwc(
217 TENSOR4D_DECLARATION(src),
218 TENSOR4D_DECLARATION(dst),
219#if defined(HAS_BIAS)
220 VECTOR_DECLARATION(bias),
221#endif // defined(HAS_BIAS)
222 int dst_size)
223{
224 // Each thread stores a 4x4/4x1 or 1x4 tile
225#if defined(SRC_DEPTH)
226 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
227 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
228#else /* defined(SRC_DEPTH) */
229 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
230 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
231#endif /* defined(SRC_DEPTH) */
232
233 int y_in = get_global_id(1);
234 int x_out = get_global_id(0);
235 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
236 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
237#if defined(SRC_DEPTH)
238 int batch = get_global_id(2) / SRC_DEPTH;
239#endif /* defined(SRC_DEPTH) */
240
241#if defined(SRC_DEPTH)
242 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
243#else /* defined(SRC_DEPTH) */
244
245 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
246#endif /* defined(SRC_DEPTH) */
247
248 // Load the values across the channels to compose the input tile
249 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
250 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
251 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
252 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
253 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
254 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
255 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
256 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
257
258#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
259 // Compute out00, out01, out02 and out03
260 float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
261 float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
262
263#if defined(HAS_BIAS)
264 // Add bias
265 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
266
267 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
268
269 out00 += (float)b;
270 out01 += (float)b;
271#endif // defined(HAS_BIAS)
272
273 // Store the output tile
274#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
275 // Get output address
276#if defined(SRC_DEPTH)
277 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
278#else /* defined(SRC_DEPTH) */
279 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
280#endif /* defined(SRC_DEPTH) */
281 offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
282
283 VEC_DATA_TYPE(DATA_TYPE, 2)
Giorgio Arenad056e572020-10-12 11:53:51 +0100284 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100285 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
286 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
287#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
288 // Get output address
289 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
290 VEC_DATA_TYPE(DATA_TYPE, 2)
Giorgio Arenad056e572020-10-12 11:53:51 +0100291 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL,
292 B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100293 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
294 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
295#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
296
297#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
298
299 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
300 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
301 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
302 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
303 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
304 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
305 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
306 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
307
308 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
309 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
310 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
311 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
312 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
313 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
314 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
315 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
316
317 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
318 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
319 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
320 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
321 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
322 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
323 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
324 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
325
326 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
327 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
328 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
329 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
330 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
331 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
332 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
333 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
334
335 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
336 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
337 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
338 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
339 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
340 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
341 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
342 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
343
344 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
345 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
346 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
347 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
348 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
349 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
350 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
351 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
352
353 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
354 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
355 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
356 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
357 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
358 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
359 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
360 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
361
362 // Compute the 8x2 intermediate tensor
363 VEC_DATA_TYPE(float, 2)
364 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
365
366 COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
367 COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
368 COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
369 COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
370 COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
371 COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
372 COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
373 COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
374
375 // Compute the 2x2 output tile
376 VEC_DATA_TYPE(float, 2)
377 out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
378 VEC_DATA_TYPE(float, 2)
379 out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
380
381#if defined(HAS_BIAS)
382 // Add bias
383 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
384
385 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
386
387 out_col0 += (VEC_DATA_TYPE(float, 2))b;
388 out_col1 += (VEC_DATA_TYPE(float, 2))b;
389
390#endif // defined(HAS_BIAS)
391 // Get output address
392#if defined(SRC_DEPTH)
393 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
394#else /* defined(SRC_DEPTH) */
395 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
396#endif /* defined(SRC_DEPTH) */
397 offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
398 int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
399
400 // Store the output tile
401 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +0100402 out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100403 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +0100404 out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100405
406 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
407 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
408
409 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
410 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
411
412#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
413}
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100414#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100415
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100416#if defined(VEC_SIZE) && VEC_SIZE == 4
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100417/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
418 *
419 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
420 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
421 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
422 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
423 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100424 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100425 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100426 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100427 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
428 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
429 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
430 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
431 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
432 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100433 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
434 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100435 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
436 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
437 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
438 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
439 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
440 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100441 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
442 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
443 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
444 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100445 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
446 */
447__kernel void winograd_output_transform_4x4_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100448 TENSOR4D_DECLARATION(src),
449 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100450#if defined(HAS_BIAS)
451 ,
452 VECTOR_DECLARATION(bias)
453#endif // defined(HAS_BIAS)
454)
455{
456 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000457#if defined(SRC_DEPTH)
458 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100459 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000460#else /* defined(SRC_DEPTH) */
giuros013bfacb22019-04-01 12:07:02 +0100461 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
462 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000463#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100464
465 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100466 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
467 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
468 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
469 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
470 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
471 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100472
473#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
474 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000475 float out00 = d00 + d01 + d02 + d03 + d04;
476 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
477 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
478 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100479#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100480
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100481 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
482 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
483 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
484 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
485 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
486 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100487
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100488 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
489 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
490 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
491 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
492 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
493 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100494
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100495 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
496 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
497 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
498 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
499 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
500 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100501
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100502 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
503 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
504 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
505 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
506 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
507 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100508
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100509 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
510 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
511 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
512 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
513 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
514 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100515
516 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000517 float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
518 float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
519 float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
520 float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100521
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000522 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
523 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100524
525 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
526 out01 += k1 - d02 - d12 - d22 - d32 - d42;
527 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
528 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
529
530 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000531 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
532 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
533 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
534 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100535
536 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
537 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
538
539 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
540 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
541 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
542 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
543
544 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000545 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
546 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
547 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
548 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100549
550 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
551 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
552
553 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
554 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
555 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
556 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
557
558 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000559 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
560 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
561 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
562 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100563
564 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
565 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
566
567 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
568 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
569 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
570 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
571#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
572
573 int y_in = get_global_id(1);
574 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
575 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
576 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000577#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100578 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000579#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100580
581#if defined(HAS_BIAS)
582 // Add bias
583 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
584
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000585 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100586
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000587 out00 += (float)b;
588 out01 += (float)b;
589 out02 += (float)b;
590 out03 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100591#endif // defined(HAS_BIAS)
592
593 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000594#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100595 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000596#else /* defined(SRC_DEPTH) */
597 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
598#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100599
600 // Store the output tile
601#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100602 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100603 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
604 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100605 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
606 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
607 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
608 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100609#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +0100610 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100611 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100612#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
613
614#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
615#if defined(HAS_BIAS)
616 // Add bias
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000617 out10 += (float)b;
618 out11 += (float)b;
619 out12 += (float)b;
620 out13 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100621
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000622 out20 += (float)b;
623 out21 += (float)b;
624 out22 += (float)b;
625 out23 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100626
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000627 out30 += (float)b;
628 out31 += (float)b;
629 out32 += (float)b;
630 out33 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100631#endif // defined(HAS_BIAS)
Giorgio Arenad056e572020-10-12 11:53:51 +0100632 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100633 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100634 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100635 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100636 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100637 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100638#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
639}
640
Giorgio Arena149fdf32018-07-04 17:03:33 +0100641/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100642 *
643 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100644 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
645 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
646 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
647 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100648 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100649 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100650 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100651 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
652 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
653 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
654 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
655 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
656 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100657 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
658 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100659 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
660 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
661 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
662 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
663 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
664 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100665 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
666 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
667 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
668 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100669 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
670 * @param[in] dst_size Size of the destination tensor, minus the last padding
671 */
672__kernel void winograd_output_transform_4x4_3x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100673 TENSOR4D_DECLARATION(src),
674 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100675#if defined(HAS_BIAS)
676 VECTOR_DECLARATION(bias),
677#endif // defined(HAS_BIAS)
678 int dst_size)
679{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100680 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000681#if defined(SRC_DEPTH)
682 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100683 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000684#else /* defined(SRC_DEPTH) */
685 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
686 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
687#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100688
Giorgio Arena149fdf32018-07-04 17:03:33 +0100689 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100690 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
691 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
692 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
693 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
694 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
695 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100696
Giorgio Arena149fdf32018-07-04 17:03:33 +0100697#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
698 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000699 float out00 = d00 + d01 + d02 + d03 + d04;
700 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
701 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
702 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100703#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
704
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100705 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
706 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
707 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
708 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
709 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
710 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100711
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100712 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
713 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
714 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
715 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
716 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
717 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100718
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100719 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
720 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
721 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
722 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
723 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
724 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100725
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100726 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
727 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
728 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
729 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
730 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
731 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100732
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100733 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
734 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
735 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
736 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
737 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
738 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100739
740 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000741 float out00 = d01 + d21 + d41 + d11 + d31;
742 float out01 = d01 + d21 + d41 + d11 + d31;
743 float out02 = d01 + d21 + d41 + d11 + d31;
744 float out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100745
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000746 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
747 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100748
749 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
750 out01 += k1 - d02 - d12 - d22 - d32 - d42;
751 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
752 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
753
754 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000755 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
756 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
757 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
758 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100759
760 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
761 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
762
763 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
764 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
765 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
766 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
767
768 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000769 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
770 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
771 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
772 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100773
774 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
775 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
776
777 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
778 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
779 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
780 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
781
782 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000783 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
784 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
785 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
786 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100787
788 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
789 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
790
791 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
792 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
793 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
794 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100795#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100796
797 int y_in = get_global_id(1);
798 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100799 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
800 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000801#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100802 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000803#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100804
805#if defined(HAS_BIAS)
806 // Add bias
807 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
808
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100809 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100810
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100811 out00 += (DATA_TYPE)b;
812 out01 += (DATA_TYPE)b;
813 out02 += (DATA_TYPE)b;
814 out03 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100815#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100816 out10 += (DATA_TYPE)b;
817 out11 += (DATA_TYPE)b;
818 out12 += (DATA_TYPE)b;
819 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100820
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100821 out20 += (DATA_TYPE)b;
822 out21 += (DATA_TYPE)b;
823 out22 += (DATA_TYPE)b;
824 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100825
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100826 out30 += (DATA_TYPE)b;
827 out31 += (DATA_TYPE)b;
828 out32 += (DATA_TYPE)b;
829 out33 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100830#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100831
832#endif // defined(HAS_BIAS)
833
Giorgio Arena149fdf32018-07-04 17:03:33 +0100834#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000835#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100836 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100837#else /* defined(SRC_DEPTH) */
giuros013bfacb22019-04-01 12:07:02 +0100838 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100839#endif /* defined(SRC_DEPTH) */
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000840 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Giorgio Arena149fdf32018-07-04 17:03:33 +0100841
842 // Store the 1x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100843 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100844 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
845 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100846 *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
847 *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
848 *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
849 *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100850#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
851 // Store the 4x1 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100852 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenad02eb452018-07-18 11:45:30 +0100853 int mult_y = min(dst_size - offset, 1);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100854
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100855 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100856 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)),
Usama Arif6a98a6e2019-05-10 17:07:27 +0100857 A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100858 *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;
859 *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;
860 *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;
861 *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100862#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100863 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000864#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100865 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000866#else /* defined(SRC_DEPTH) */
867 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
868#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100869 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100870 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100871
872 // Store the 4x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100873 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100874 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100875 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100876 out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100877 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100878 out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100879 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100880 out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33),
881 VEC_DATA_TYPE(DATA_TYPE, 4)),
Usama Arif6a98a6e2019-05-10 17:07:27 +0100882 A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100883 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
884 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
885 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;
886 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;
887 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;
888 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;
889 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;
890 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;
891 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;
892 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;
893 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;
894 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;
895 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;
896 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;
897 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;
898 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100899
900#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100901}
902
903#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
904 ({ \
905 comm_fact.s0 = d1 + d2; \
906 comm_fact.s1 = d3 + d4; \
907 comm_fact.s2 = d5 + d6; \
908 \
909 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
910 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
911 \
912 comm_fact.s0 = d1 - d2; \
913 comm_fact.s1 = d3 - d4; \
914 comm_fact.s2 = d5 - d6; \
915 \
916 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
917 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
918 })
919
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100920/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100921 *
922 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100923 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
924 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
925 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
926 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100927 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100928 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100929 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100930 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
931 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
932 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
933 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
934 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
935 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100936 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
937 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100938 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
939 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
940 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
941 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
942 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
943 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100944 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
945 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
946 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
947 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100948 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
949 */
950__kernel void winograd_output_transform_4x4_5x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100951 TENSOR4D_DECLARATION(src),
952 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100953#if defined(HAS_BIAS)
954 ,
955 VECTOR_DECLARATION(bias)
956#endif // defined(HAS_BIAS)
957)
958{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100959 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000960#if defined(SRC_DEPTH)
961 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100962 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000963#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100964
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000965 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
966 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
967#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100968
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100969 // Compute output address
970 int y_in = get_global_id(1);
971 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
972 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
973 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000974#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100975 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000976#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100977
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000978#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100979 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000980#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100981
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000982 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
983#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100984
985 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100986 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
987 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
988 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
989 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
990 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
991 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
992 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
993 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100994
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100995#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
996 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000997 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
998 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
999 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
1000 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001001
1002#if defined(HAS_BIAS)
1003 // Add bias
1004 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1005
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001006 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001007
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001008 out00 += (DATA_TYPE)b;
1009 out01 += (DATA_TYPE)b;
1010 out02 += (DATA_TYPE)b;
1011 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001012#endif // defined(HAS_BIAS)
1013
1014 // Store the output tile
1015#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001016 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +01001017 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
1018 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001019 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
1020 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
1021 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
1022 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001023#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +01001024 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001025 (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001026#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1027
1028#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001029
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001030 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1031 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1032 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1033 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1034 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1035 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1036 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1037 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001038
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001039 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1040 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1041 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1042 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1043 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1044 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1045 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1046 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001047
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001048 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1049 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1050 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1051 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1052 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1053 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1054 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1055 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001056
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001057 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1058 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1059 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1060 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1061 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1062 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1063 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1064 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001065
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001066 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1067 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1068 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1069 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1070 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1071 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1072 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1073 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001074
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001075 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1076 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1077 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1078 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1079 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1080 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1081 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1082 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001083
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001084 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1085 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1086 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1087 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1088 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1089 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1090 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1091 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001092
1093 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001094 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001095 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001096 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001097 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001098
1099 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1100 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1101 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1102 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1103 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1104 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1105 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1106 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1107
1108 // Compute the 4x4 output tile
1109 comm_fact0 = tmp_col1 + tmp_col2;
1110 comm_fact1 = tmp_col3 + tmp_col4;
1111 comm_fact2 = tmp_col5 + tmp_col6;
1112
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001113 VEC_DATA_TYPE(float, 4)
1114 out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
1115 VEC_DATA_TYPE(float, 4)
1116 out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001117
1118 comm_fact0 = tmp_col1 - tmp_col2;
1119 comm_fact1 = tmp_col3 - tmp_col4;
1120 comm_fact2 = tmp_col5 - tmp_col6;
1121
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001122 VEC_DATA_TYPE(float, 4)
1123 out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
1124 VEC_DATA_TYPE(float, 4)
1125 out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001126
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001127#if defined(HAS_BIAS)
1128 // Add bias
1129 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1130
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001131 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001132
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001133 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1134 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1135 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1136 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001137#endif // defined(HAS_BIAS)
1138
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001139 // Store the output tile
Giorgio Arenad056e572020-10-12 11:53:51 +01001140 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001141 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +01001142 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001143 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +01001144 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001145 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +01001146 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001147 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001148#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001149}
1150
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001151/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001152 *
1153 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001154 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1155 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1156 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1157 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001158 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001159 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001160 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001161 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1162 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1163 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1164 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1165 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1166 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001167 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1168 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001169 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1170 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1171 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1172 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1173 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1174 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001175 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1176 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1177 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1178 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001179 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1180 */
1181__kernel void winograd_output_transform_4x4_5x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001182 TENSOR4D_DECLARATION(src),
1183 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001184#if defined(HAS_BIAS)
1185 VECTOR_DECLARATION(bias),
1186#endif // defined(HAS_BIAS)
1187 int dst_size)
1188{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001189 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001190#if defined(SRC_DEPTH)
1191 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001192 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001193#else /* defined(SRC_DEPTH) */
1194 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1195 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
1196#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001197
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001198 int y_in = get_global_id(1);
1199 int x_out = get_global_id(0);
1200 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
1201 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001202#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001203 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001204#endif /* defined(SRC_DEPTH) */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001205
1206 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001207 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
1208 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
1209 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
1210 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
1211 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
1212 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
1213 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
1214 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001215
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001216#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1217 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001218 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
1219 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
1220 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
1221 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001222
1223#if defined(HAS_BIAS)
1224 // Add bias
1225 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1226
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001227 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001228
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001229 out00 += (float)b;
1230 out01 += (float)b;
1231 out02 += (float)b;
1232 out03 += (float)b;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001233#endif // defined(HAS_BIAS)
1234
1235 // Store the output tile
1236#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1237 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001238#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001239 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001240#else /* defined(SRC_DEPTH) */
1241 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1242#endif /* defined(SRC_DEPTH) */
1243 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001244
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001245 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +01001246 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
1247 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001248 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
1249 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
1250 *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
1251 *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001252#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1253 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001254 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001255 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +01001256 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001257 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001258 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
1259 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
1260 *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;
1261 *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001262#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1263
1264#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1265
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001266 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1267 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1268 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1269 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1270 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1271 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1272 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1273 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001274
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001275 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1276 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1277 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1278 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1279 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1280 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1281 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1282 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001283
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001284 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1285 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1286 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1287 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1288 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1289 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1290 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1291 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001292
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001293 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1294 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1295 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1296 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1297 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1298 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1299 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1300 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001301
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001302 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1303 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1304 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1305 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1306 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1307 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1308 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1309 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001310
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001311 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1312 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1313 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1314 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1315 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1316 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1317 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1318 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001319
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001320 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1321 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1322 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1323 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1324 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1325 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1326 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1327 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001328
1329 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001330 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001331 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001332 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001333 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001334
1335 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1336 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1337 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1338 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1339 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1340 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1341 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1342 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1343
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001344 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001345 comm_fact0 = tmp_col1 + tmp_col2;
1346 comm_fact1 = tmp_col3 + tmp_col4;
1347 comm_fact2 = tmp_col5 + tmp_col6;
1348
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001349 VEC_DATA_TYPE(float, 4)
1350 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
1351 VEC_DATA_TYPE(float, 4)
1352 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001353
1354 comm_fact0 = tmp_col1 - tmp_col2;
1355 comm_fact1 = tmp_col3 - tmp_col4;
1356 comm_fact2 = tmp_col5 - tmp_col6;
1357
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001358 VEC_DATA_TYPE(float, 4)
1359 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
1360 VEC_DATA_TYPE(float, 4)
1361 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001362
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001363#if defined(HAS_BIAS)
1364 // Add bias
1365 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1366
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001367 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001368
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001369 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1370 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1371 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1372 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001373#endif // defined(HAS_BIAS)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001374 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001375#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001376 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001377#else /* defined(SRC_DEPTH) */
1378 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1379#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +01001380 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
1381 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001382
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001383 // Store the output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001384 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001385 out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001386 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001387 out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001388 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001389 out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001390 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001391 out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001392
1393 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
1394 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
1395 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;
1396 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;
1397 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
1398 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
1399 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;
1400 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;
1401 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;
1402 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;
1403 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;
1404 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;
1405 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;
1406 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;
1407 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;
1408 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001409#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001410}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001411#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001412
1413#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001414#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001415/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1416 *
1417 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1418 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1419 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1420 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001421 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001422 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001423 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001424 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1425 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1426 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1427 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1428 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1429 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001430 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1431 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001432 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1433 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1434 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1435 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1436 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1437 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001438 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1439 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1440 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1441 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001442 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1443 */
1444__kernel void winograd_output_transform_2x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001445 TENSOR4D_DECLARATION(src),
1446 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001447#if defined(HAS_BIAS)
1448 ,
1449 VECTOR_DECLARATION(bias)
1450#endif // defined(HAS_BIAS)
1451)
1452{
1453 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1454 src_stride_x,
1455 src_step_x,
1456 src_stride_y,
1457 src_step_y,
1458 src_stride_z,
1459 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001460 src_stride_w,
1461 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001462 src_offset_first_element_in_bytes,
1463 dst_ptr,
1464 dst_stride_x,
1465 dst_step_x,
1466 dst_stride_y,
1467 dst_step_y,
1468 dst_stride_z,
1469 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001470 dst_stride_w,
1471 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001472 dst_offset_first_element_in_bytes
1473#if defined(HAS_BIAS)
1474 ,
1475 bias_ptr,
1476 bias_stride_x,
1477 bias_step_x,
1478 bias_offset_first_element_in_bytes
1479#endif // defined(HAS_BIAS)
1480 );
1481}
giuros013bfacb22019-04-01 12:07:02 +01001482
1483/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
1484 *
1485 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1486 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1487 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1488 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1489 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1490 *
1491 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1492 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1493 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1494 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1495 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1496 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1497 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1498 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1499 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1500 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1501 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1502 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1503 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1504 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1505 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1506 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1507 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1508 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1509 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1510 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1511 */
1512__kernel void winograd_output_transform_2x1_7x1_nhwc(
1513 TENSOR4D_DECLARATION(src),
1514 TENSOR4D_DECLARATION(dst),
1515#if defined(HAS_BIAS)
1516 VECTOR_DECLARATION(bias),
1517#endif // defined(HAS_BIAS)
1518 int dst_size)
1519{
1520 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1521 src_stride_x,
1522 src_step_x,
1523 src_stride_y,
1524 src_step_y,
1525 src_stride_z,
1526 src_step_z,
1527 src_stride_w,
1528 src_step_w,
1529 src_offset_first_element_in_bytes,
1530 dst_ptr,
1531 dst_stride_x,
1532 dst_step_x,
1533 dst_stride_y,
1534 dst_step_y,
1535 dst_stride_z,
1536 dst_step_z,
1537 dst_stride_w,
1538 dst_step_w,
1539 dst_offset_first_element_in_bytes,
1540#if defined(HAS_BIAS)
1541 bias_ptr,
1542 bias_stride_x,
1543 bias_step_x,
1544 bias_offset_first_element_in_bytes,
1545#endif // defined(HAS_BIAS)
1546 dst_size);
1547}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001548#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001549
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001550#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001551/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1552 *
1553 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1554 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1555 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1556 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001557 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001558 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001559 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001560 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1561 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1562 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1563 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1564 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1565 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001566 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1567 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001568 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1569 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1570 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1571 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1572 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1573 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001574 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1575 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1576 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1577 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001578 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1579 */
1580__kernel void winograd_output_transform_4x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001581 TENSOR4D_DECLARATION(src),
1582 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001583#if defined(HAS_BIAS)
1584 ,
1585 VECTOR_DECLARATION(bias)
1586#endif // defined(HAS_BIAS)
1587)
1588{
1589 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1590 src_stride_x,
1591 src_step_x,
1592 src_stride_y,
1593 src_step_y,
1594 src_stride_z,
1595 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001596 src_stride_w,
1597 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001598 src_offset_first_element_in_bytes,
1599 dst_ptr,
1600 dst_stride_x,
1601 dst_step_x,
1602 dst_stride_y,
1603 dst_step_y,
1604 dst_stride_z,
1605 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001606 dst_stride_w,
1607 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001608 dst_offset_first_element_in_bytes
1609#if defined(HAS_BIAS)
1610 ,
1611 bias_ptr,
1612 bias_stride_x,
1613 bias_step_x,
1614 bias_offset_first_element_in_bytes
1615#endif // defined(HAS_BIAS)
1616 );
1617}
1618
1619/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1620 *
1621 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1622 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1623 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1624 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001625 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001626 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001627 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001628 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1629 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1630 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1631 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1632 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1633 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001634 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1635 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001636 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1637 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1638 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1639 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1640 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1641 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001642 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1643 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1644 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1645 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001646 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1647 */
1648__kernel void winograd_output_transform_4x1_5x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001649 TENSOR4D_DECLARATION(src),
1650 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001651#if defined(HAS_BIAS)
1652 ,
1653 VECTOR_DECLARATION(bias)
1654#endif // defined(HAS_BIAS)
1655)
1656{
1657 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1658 src_stride_x,
1659 src_step_x,
1660 src_stride_y,
1661 src_step_y,
1662 src_stride_z,
1663 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001664 src_stride_w,
1665 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001666 src_offset_first_element_in_bytes,
1667 dst_ptr,
1668 dst_stride_x,
1669 dst_step_x,
1670 dst_stride_y,
1671 dst_step_y,
1672 dst_stride_z,
1673 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001674 dst_stride_w,
1675 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001676 dst_offset_first_element_in_bytes
1677#if defined(HAS_BIAS)
1678 ,
1679 bias_ptr,
1680 bias_stride_x,
1681 bias_step_x,
1682 bias_offset_first_element_in_bytes
1683#endif // defined(HAS_BIAS)
1684 );
1685}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001686
1687/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1688 *
1689 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1690 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1691 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1692 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001693 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001694 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001695 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001696 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1697 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1698 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1699 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1700 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1701 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001702 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1703 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001704 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1705 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1706 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1707 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1708 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1709 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001710 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1711 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1712 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1713 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001714 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1715 */
1716__kernel void winograd_output_transform_4x1_3x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001717 TENSOR4D_DECLARATION(src),
1718 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001719#if defined(HAS_BIAS)
1720 VECTOR_DECLARATION(bias),
1721#endif // defined(HAS_BIAS)
1722 int dst_size)
1723{
1724 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1725 src_stride_x,
1726 src_step_x,
1727 src_stride_y,
1728 src_step_y,
1729 src_stride_z,
1730 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001731 src_stride_w,
1732 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001733 src_offset_first_element_in_bytes,
1734 dst_ptr,
1735 dst_stride_x,
1736 dst_step_x,
1737 dst_stride_y,
1738 dst_step_y,
1739 dst_stride_z,
1740 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001741 dst_stride_w,
1742 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001743 dst_offset_first_element_in_bytes,
1744#if defined(HAS_BIAS)
1745 bias_ptr,
1746 bias_stride_x,
1747 bias_step_x,
1748 bias_offset_first_element_in_bytes,
1749#endif // defined(HAS_BIAS)
1750 dst_size);
1751}
1752
1753/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1754 *
1755 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1756 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1757 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1758 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001759 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001760 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001761 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001762 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1763 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1764 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1765 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1766 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1767 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001768 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1769 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001770 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1771 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1772 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1773 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1774 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1775 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001776 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1777 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1778 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1779 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001780 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1781 */
1782__kernel void winograd_output_transform_4x1_5x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001783 TENSOR4D_DECLARATION(src),
1784 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001785#if defined(HAS_BIAS)
1786 VECTOR_DECLARATION(bias),
1787#endif // defined(HAS_BIAS)
1788 int dst_size)
1789{
1790 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1791 src_stride_x,
1792 src_step_x,
1793 src_stride_y,
1794 src_step_y,
1795 src_stride_z,
1796 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001797 src_stride_w,
1798 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001799 src_offset_first_element_in_bytes,
1800 dst_ptr,
1801 dst_stride_x,
1802 dst_step_x,
1803 dst_stride_y,
1804 dst_step_y,
1805 dst_stride_z,
1806 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001807 dst_stride_w,
1808 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001809 dst_offset_first_element_in_bytes,
1810#if defined(HAS_BIAS)
1811 bias_ptr,
1812 bias_stride_x,
1813 bias_step_x,
1814 bias_offset_first_element_in_bytes,
1815#endif // defined(HAS_BIAS)
1816 dst_size);
1817}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001818#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001819#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1820
1821#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001822#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001823/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1824 *
1825 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1826 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1827 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1828 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001829 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001830 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001831 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001832 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1833 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1834 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1835 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1836 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1837 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001838 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1839 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001840 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1841 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1842 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1843 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1844 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1845 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001846 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1847 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1848 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1849 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001850 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1851 */
1852__kernel void winograd_output_transform_1x2_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001853 TENSOR4D_DECLARATION(src),
1854 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001855#if defined(HAS_BIAS)
1856 ,
1857 VECTOR_DECLARATION(bias)
1858#endif // defined(HAS_BIAS)
1859)
1860{
1861 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1862 src_stride_x,
1863 src_step_x,
1864 src_stride_y,
1865 src_step_y,
1866 src_stride_z,
1867 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001868 src_stride_w,
1869 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001870 src_offset_first_element_in_bytes,
1871 dst_ptr,
1872 dst_stride_x,
1873 dst_step_x,
1874 dst_stride_y,
1875 dst_step_y,
1876 dst_stride_z,
1877 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001878 dst_stride_w,
1879 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001880 dst_offset_first_element_in_bytes
1881#if defined(HAS_BIAS)
1882 ,
1883 bias_ptr,
1884 bias_stride_x,
1885 bias_step_x,
1886 bias_offset_first_element_in_bytes
1887#endif // defined(HAS_BIAS)
1888 );
1889}
giuros013bfacb22019-04-01 12:07:02 +01001890
1891/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
1892 *
1893 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1894 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1895 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1896 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1897 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1898 *
1899 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1900 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1901 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1902 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1903 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1904 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1905 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1906 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1907 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1908 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1909 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1910 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1911 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1912 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1913 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1914 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1915 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1916 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1917 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1918 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1919 */
1920__kernel void winograd_output_transform_1x2_1x7_nhwc(
1921 TENSOR4D_DECLARATION(src),
1922 TENSOR4D_DECLARATION(dst),
1923#if defined(HAS_BIAS)
1924 VECTOR_DECLARATION(bias),
1925#endif // defined(HAS_BIAS)
1926 int dst_size)
1927{
1928 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1929 src_stride_x,
1930 src_step_x,
1931 src_stride_y,
1932 src_step_y,
1933 src_stride_z,
1934 src_step_z,
1935 src_stride_w,
1936 src_step_w,
1937 src_offset_first_element_in_bytes,
1938 dst_ptr,
1939 dst_stride_x,
1940 dst_step_x,
1941 dst_stride_y,
1942 dst_step_y,
1943 dst_stride_z,
1944 dst_step_z,
1945 dst_stride_w,
1946 dst_step_w,
1947 dst_offset_first_element_in_bytes,
1948#if defined(HAS_BIAS)
1949 bias_ptr,
1950 bias_stride_x,
1951 bias_step_x,
1952 bias_offset_first_element_in_bytes,
1953#endif // defined(HAS_BIAS)
1954 dst_size);
1955}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001956#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001957
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001958#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001959/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1960 *
1961 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1962 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1963 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1964 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001965 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001966 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001967 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001968 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1969 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1970 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1971 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1972 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1973 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001974 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1975 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001976 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1977 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1978 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1979 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1980 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1981 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001982 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1983 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1984 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1985 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001986 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1987 */
1988__kernel void winograd_output_transform_1x4_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001989 TENSOR4D_DECLARATION(src),
1990 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001991#if defined(HAS_BIAS)
1992 ,
1993 VECTOR_DECLARATION(bias)
1994#endif // defined(HAS_BIAS)
1995)
1996{
1997 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1998 src_stride_x,
1999 src_step_x,
2000 src_stride_y,
2001 src_step_y,
2002 src_stride_z,
2003 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002004 src_stride_w,
2005 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002006 src_offset_first_element_in_bytes,
2007 dst_ptr,
2008 dst_stride_x,
2009 dst_step_x,
2010 dst_stride_y,
2011 dst_step_y,
2012 dst_stride_z,
2013 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002014 dst_stride_w,
2015 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002016 dst_offset_first_element_in_bytes
2017#if defined(HAS_BIAS)
2018 ,
2019 bias_ptr,
2020 bias_stride_x,
2021 bias_step_x,
2022 bias_offset_first_element_in_bytes
2023#endif // defined(HAS_BIAS)
2024 );
2025}
2026
2027/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
2028 *
2029 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2030 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2031 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2032 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002033 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002034 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002035 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002036 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2037 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2038 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2039 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2040 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2041 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002042 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2043 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002044 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2045 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2046 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2047 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2048 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2049 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002050 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2051 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2052 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2053 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002054 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2055 */
2056__kernel void winograd_output_transform_1x4_1x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002057 TENSOR4D_DECLARATION(src),
2058 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002059#if defined(HAS_BIAS)
2060 ,
2061 VECTOR_DECLARATION(bias)
2062#endif // defined(HAS_BIAS)
2063)
2064{
2065 winograd_output_transform_4x4_5x5_nchw(src_ptr,
2066 src_stride_x,
2067 src_step_x,
2068 src_stride_y,
2069 src_step_y,
2070 src_stride_z,
2071 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002072 src_stride_w,
2073 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002074 src_offset_first_element_in_bytes,
2075 dst_ptr,
2076 dst_stride_x,
2077 dst_step_x,
2078 dst_stride_y,
2079 dst_step_y,
2080 dst_stride_z,
2081 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002082 dst_stride_w,
2083 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002084 dst_offset_first_element_in_bytes
2085#if defined(HAS_BIAS)
2086 ,
2087 bias_ptr,
2088 bias_stride_x,
2089 bias_step_x,
2090 bias_offset_first_element_in_bytes
2091#endif // defined(HAS_BIAS)
2092 );
2093}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002094
2095/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
2096 *
2097 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2098 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2099 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2100 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002101 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002102 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002103 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002104 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2105 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2106 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2107 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2108 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2109 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002110 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2111 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002112 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2113 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2114 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2115 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2116 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2117 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002118 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2119 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2120 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2121 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002122 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2123 */
2124__kernel void winograd_output_transform_1x4_1x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002125 TENSOR4D_DECLARATION(src),
2126 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002127#if defined(HAS_BIAS)
2128 VECTOR_DECLARATION(bias),
2129#endif // defined(HAS_BIAS)
2130 int dst_size)
2131{
2132 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
2133 src_stride_x,
2134 src_step_x,
2135 src_stride_y,
2136 src_step_y,
2137 src_stride_z,
2138 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002139 src_stride_w,
2140 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002141 src_offset_first_element_in_bytes,
2142 dst_ptr,
2143 dst_stride_x,
2144 dst_step_x,
2145 dst_stride_y,
2146 dst_step_y,
2147 dst_stride_z,
2148 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002149 dst_stride_w,
2150 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002151 dst_offset_first_element_in_bytes,
2152#if defined(HAS_BIAS)
2153 bias_ptr,
2154 bias_stride_x,
2155 bias_step_x,
2156 bias_offset_first_element_in_bytes,
2157#endif // defined(HAS_BIAS)
2158 dst_size);
2159}
2160
2161/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
2162 *
2163 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2164 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2165 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2166 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002167 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002168 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002169 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002170 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2171 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2172 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2173 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2174 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2175 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002176 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2177 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002178 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2179 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2180 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2181 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2182 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2183 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002184 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2185 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2186 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2187 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002188 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2189 */
2190__kernel void winograd_output_transform_1x4_1x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002191 TENSOR4D_DECLARATION(src),
2192 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002193#if defined(HAS_BIAS)
2194 VECTOR_DECLARATION(bias),
2195#endif // defined(HAS_BIAS)
2196 int dst_size)
2197{
2198 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
2199 src_stride_x,
2200 src_step_x,
2201 src_stride_y,
2202 src_step_y,
2203 src_stride_z,
2204 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002205 src_stride_w,
2206 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002207 src_offset_first_element_in_bytes,
2208 dst_ptr,
2209 dst_stride_x,
2210 dst_step_x,
2211 dst_stride_y,
2212 dst_step_y,
2213 dst_stride_z,
2214 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002215 dst_stride_w,
2216 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002217 dst_offset_first_element_in_bytes,
2218#if defined(HAS_BIAS)
2219 bias_ptr,
2220 bias_stride_x,
2221 bias_step_x,
2222 bias_offset_first_element_in_bytes,
2223#endif // defined(HAS_BIAS)
2224 dst_size);
2225}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01002226#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002227#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002228#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)