blob: 99888edd204e9adfca01d32426107e0ce2180b18 [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2018-2019 Arm Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Usama Arif6a98a6e2019-05-10 17:07:27 +010026#include "activation_float_helpers.h"
Manuel Bottini0d0028c2018-10-02 16:41:52 +010027
Georgios Pinitasffb57a02018-10-29 18:01:52 +000028#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010029#if defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010030/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
31 *
32 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
33 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
34 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
35 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
36 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010037 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Usama Arif6a98a6e2019-05-10 17:07:27 +010038 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottini0d0028c2018-10-02 16:41:52 +010039 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
40 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010041 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010042 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010043 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
44 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
45 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
46 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
47 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
48 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010049 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
50 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010051 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
52 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
53 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
54 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
55 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
56 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010057 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
58 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
59 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
60 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010061 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
62 */
63__kernel void winograd_output_transform_2x2_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +010064 TENSOR4D_DECLARATION(src),
65 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010066#if defined(HAS_BIAS)
67 ,
68 VECTOR_DECLARATION(bias)
69#endif // defined(HAS_BIAS)
70)
71{
72 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
Georgios Pinitasffb57a02018-10-29 18:01:52 +000073#if defined(SRC_DEPTH)
74 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +010075 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +000076#else /* defined(SRC_DEPTH) */
77 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
78 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
79#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010080
81 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010082 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
83 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
84 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
85 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010086
87#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
88 // Compute the 2x1 or 1x2 output tile
89 // out00 = d00 + d01 + d02
90 // out01 = d01 - d02 - d03
91
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +000092 float out00 = d00 + d01 + d02;
93 float out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010094#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010095
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010096 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
97 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
98 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
99 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100100
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100101 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
102 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
103 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
104 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100105
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100106 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
107 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
108 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
109 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100110
111 // Compute the 2x2 output tile
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000112 float k0 = d01 + d11 + d21;
113 float k1 = d02 + d12 + d22;
114 float k2 = d11 - d21 - d31;
115 float k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100116
117 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
118 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
119 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
120 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
121
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000122 float out00 = d10;
123 float out01 = -d13;
124 float out10 = d10;
125 float out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100126
127 out00 += d00 + d20 + k0 + k1;
128 out01 += k0 - k1 - (d03 + d23);
129 out10 += -d20 - d30 + k2 + k3;
130 out11 += k2 - k3 + d23 + d33;
131#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
132
133 int y_in = get_global_id(1);
134 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
135 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
136 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000137#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100138 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000139#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100140
141#if defined(HAS_BIAS)
142 // Add bias
143 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
144
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000145 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100146
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000147 out00 += (float)b;
148 out01 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100149#endif // defined(HAS_BIAS)
150
151 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000152#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100153 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000154#else /* defined(SRC_DEPTH) */
155 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
156#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100157
158 // Store the output tile
159#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100160 const const VEC_DATA_TYPE(DATA_TYPE, 2)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100161 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100162 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
163 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100164#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100165 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
166 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100167#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
168
169#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
170#if defined(HAS_BIAS)
171 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100172 out10 += (DATA_TYPE)b;
173 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100174#endif // defined(HAS_BIAS)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100175 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
176 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100177#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
178}
giuros013bfacb22019-04-01 12:07:02 +0100179
180#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
181 ({ \
182 col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \
183 col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \
184 })
185
186/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
187 *
188 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
189 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
190 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
191 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
192 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
193 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
194 *
195 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
196 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
197 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
198 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
199 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
200 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
201 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
202 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
203 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
204 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
205 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
206 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
207 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
208 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
209 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
210 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
211 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
212 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
213 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
214 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
215 */
216__kernel void winograd_output_transform_2x2_7x7_nhwc(
217 TENSOR4D_DECLARATION(src),
218 TENSOR4D_DECLARATION(dst),
219#if defined(HAS_BIAS)
220 VECTOR_DECLARATION(bias),
221#endif // defined(HAS_BIAS)
222 int dst_size)
223{
224 // Each thread stores a 4x4/4x1 or 1x4 tile
225#if defined(SRC_DEPTH)
226 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
227 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
228#else /* defined(SRC_DEPTH) */
229 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
230 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
231#endif /* defined(SRC_DEPTH) */
232
233 int y_in = get_global_id(1);
234 int x_out = get_global_id(0);
235 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
236 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
237#if defined(SRC_DEPTH)
238 int batch = get_global_id(2) / SRC_DEPTH;
239#endif /* defined(SRC_DEPTH) */
240
241#if defined(SRC_DEPTH)
242 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
243#else /* defined(SRC_DEPTH) */
244
245 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
246#endif /* defined(SRC_DEPTH) */
247
248 // Load the values across the channels to compose the input tile
249 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
250 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
251 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
252 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
253 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
254 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
255 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
256 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
257
258#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
259 // Compute out00, out01, out02 and out03
260 float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
261 float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
262
263#if defined(HAS_BIAS)
264 // Add bias
265 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
266
267 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
268
269 out00 += (float)b;
270 out01 += (float)b;
271#endif // defined(HAS_BIAS)
272
273 // Store the output tile
274#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
275 // Get output address
276#if defined(SRC_DEPTH)
277 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
278#else /* defined(SRC_DEPTH) */
279 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
280#endif /* defined(SRC_DEPTH) */
281 offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
282
283 VEC_DATA_TYPE(DATA_TYPE, 2)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100284 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100285 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
286 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
287#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
288 // Get output address
289 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
290 VEC_DATA_TYPE(DATA_TYPE, 2)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100291 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100292 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
293 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
294#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
295
296#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
297
298 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
299 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
300 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
301 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
302 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
303 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
304 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
305 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
306
307 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
308 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
309 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
310 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
311 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
312 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
313 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
314 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
315
316 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
317 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
318 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
319 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
320 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
321 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
322 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
323 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
324
325 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
326 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
327 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
328 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
329 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
330 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
331 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
332 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
333
334 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
335 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
336 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
337 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
338 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
339 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
340 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
341 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
342
343 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
344 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
345 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
346 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
347 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
348 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
349 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
350 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
351
352 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
353 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
354 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
355 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
356 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
357 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
358 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
359 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
360
361 // Compute the 8x2 intermediate tensor
362 VEC_DATA_TYPE(float, 2)
363 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
364
365 COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
366 COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
367 COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
368 COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
369 COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
370 COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
371 COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
372 COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
373
374 // Compute the 2x2 output tile
375 VEC_DATA_TYPE(float, 2)
376 out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
377 VEC_DATA_TYPE(float, 2)
378 out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
379
380#if defined(HAS_BIAS)
381 // Add bias
382 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
383
384 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
385
386 out_col0 += (VEC_DATA_TYPE(float, 2))b;
387 out_col1 += (VEC_DATA_TYPE(float, 2))b;
388
389#endif // defined(HAS_BIAS)
390 // Get output address
391#if defined(SRC_DEPTH)
392 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
393#else /* defined(SRC_DEPTH) */
394 int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
395#endif /* defined(SRC_DEPTH) */
396 offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
397 int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
398
399 // Store the output tile
400 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100401 out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100402 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100403 out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100404
405 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
406 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
407
408 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
409 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
410
411#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
412}
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100413#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100414
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100415#if defined(VEC_SIZE) && VEC_SIZE == 4
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100416/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
417 *
418 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
419 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
420 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
421 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
422 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100423 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100424 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100425 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100426 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
427 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
428 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
429 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
430 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
431 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100432 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
433 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100434 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
435 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
436 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
437 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
438 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
439 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100440 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
441 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
442 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
443 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100444 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
445 */
446__kernel void winograd_output_transform_4x4_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100447 TENSOR4D_DECLARATION(src),
448 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100449#if defined(HAS_BIAS)
450 ,
451 VECTOR_DECLARATION(bias)
452#endif // defined(HAS_BIAS)
453)
454{
455 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000456#if defined(SRC_DEPTH)
457 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100458 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000459#else /* defined(SRC_DEPTH) */
giuros013bfacb22019-04-01 12:07:02 +0100460 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
461 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000462#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100463
464 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100465 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
466 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
467 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
468 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
469 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
470 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100471
472#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
473 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000474 float out00 = d00 + d01 + d02 + d03 + d04;
475 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
476 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
477 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100478#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100479
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100480 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
481 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
482 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
483 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
484 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
485 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100486
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100487 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
488 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
489 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
490 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
491 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
492 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100493
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100494 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
495 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
496 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
497 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
498 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
499 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100500
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100501 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
502 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
503 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
504 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
505 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
506 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100507
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100508 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
509 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
510 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
511 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
512 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
513 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100514
515 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000516 float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
517 float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
518 float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
519 float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100520
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000521 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
522 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100523
524 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
525 out01 += k1 - d02 - d12 - d22 - d32 - d42;
526 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
527 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
528
529 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000530 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
531 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
532 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
533 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100534
535 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
536 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
537
538 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
539 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
540 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
541 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
542
543 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000544 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
545 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
546 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
547 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100548
549 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
550 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
551
552 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
553 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
554 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
555 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
556
557 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000558 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
559 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
560 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
561 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100562
563 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
564 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
565
566 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
567 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
568 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
569 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
570#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
571
572 int y_in = get_global_id(1);
573 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
574 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
575 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000576#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100577 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000578#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100579
580#if defined(HAS_BIAS)
581 // Add bias
582 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
583
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000584 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100585
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000586 out00 += (float)b;
587 out01 += (float)b;
588 out02 += (float)b;
589 out03 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100590#endif // defined(HAS_BIAS)
591
592 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000593#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100594 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000595#else /* defined(SRC_DEPTH) */
596 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
597#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100598
599 // Store the output tile
600#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100601 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100602 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100603 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
604 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
605 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
606 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100607#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100608 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
609 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100610#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
611
612#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
613#if defined(HAS_BIAS)
614 // Add bias
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000615 out10 += (float)b;
616 out11 += (float)b;
617 out12 += (float)b;
618 out13 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100619
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000620 out20 += (float)b;
621 out21 += (float)b;
622 out22 += (float)b;
623 out23 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100624
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000625 out30 += (float)b;
626 out31 += (float)b;
627 out32 += (float)b;
628 out33 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100629#endif // defined(HAS_BIAS)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100630 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
631 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
632 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
633 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
634 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
635 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100636#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
637}
638
Giorgio Arena149fdf32018-07-04 17:03:33 +0100639/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100640 *
641 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100642 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
643 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
644 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
645 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100646 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100647 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100648 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100649 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
650 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
651 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
652 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
653 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
654 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100655 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
656 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100657 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
658 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
659 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
660 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
661 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
662 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100663 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
664 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
665 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
666 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100667 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
668 * @param[in] dst_size Size of the destination tensor, minus the last padding
669 */
670__kernel void winograd_output_transform_4x4_3x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100671 TENSOR4D_DECLARATION(src),
672 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100673#if defined(HAS_BIAS)
674 VECTOR_DECLARATION(bias),
675#endif // defined(HAS_BIAS)
676 int dst_size)
677{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100678 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000679#if defined(SRC_DEPTH)
680 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100681 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000682#else /* defined(SRC_DEPTH) */
683 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
684 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
685#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100686
Giorgio Arena149fdf32018-07-04 17:03:33 +0100687 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100688 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
689 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
690 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
691 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
692 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
693 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100694
Giorgio Arena149fdf32018-07-04 17:03:33 +0100695#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
696 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000697 float out00 = d00 + d01 + d02 + d03 + d04;
698 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
699 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
700 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100701#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
702
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100703 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
704 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
705 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
706 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
707 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
708 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100709
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100710 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
711 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
712 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
713 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
714 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
715 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100716
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100717 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
718 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
719 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
720 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
721 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
722 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100723
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100724 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
725 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
726 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
727 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
728 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
729 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100730
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100731 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
732 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
733 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
734 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
735 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
736 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100737
738 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000739 float out00 = d01 + d21 + d41 + d11 + d31;
740 float out01 = d01 + d21 + d41 + d11 + d31;
741 float out02 = d01 + d21 + d41 + d11 + d31;
742 float out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100743
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000744 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
745 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100746
747 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
748 out01 += k1 - d02 - d12 - d22 - d32 - d42;
749 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
750 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
751
752 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000753 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
754 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
755 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
756 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100757
758 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
759 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
760
761 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
762 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
763 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
764 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
765
766 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000767 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
768 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
769 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
770 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100771
772 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
773 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
774
775 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
776 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
777 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
778 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
779
780 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000781 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
782 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
783 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
784 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100785
786 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
787 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
788
789 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
790 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
791 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
792 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100793#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100794
795 int y_in = get_global_id(1);
796 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100797 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
798 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000799#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100800 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000801#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100802
803#if defined(HAS_BIAS)
804 // Add bias
805 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
806
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100807 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100808
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100809 out00 += (DATA_TYPE)b;
810 out01 += (DATA_TYPE)b;
811 out02 += (DATA_TYPE)b;
812 out03 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100813#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100814 out10 += (DATA_TYPE)b;
815 out11 += (DATA_TYPE)b;
816 out12 += (DATA_TYPE)b;
817 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100818
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100819 out20 += (DATA_TYPE)b;
820 out21 += (DATA_TYPE)b;
821 out22 += (DATA_TYPE)b;
822 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100823
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100824 out30 += (DATA_TYPE)b;
825 out31 += (DATA_TYPE)b;
826 out32 += (DATA_TYPE)b;
827 out33 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100828#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100829
830#endif // defined(HAS_BIAS)
831
Giorgio Arena149fdf32018-07-04 17:03:33 +0100832#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000833#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100834 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100835#else /* defined(SRC_DEPTH) */
giuros013bfacb22019-04-01 12:07:02 +0100836 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100837#endif /* defined(SRC_DEPTH) */
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000838 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Giorgio Arena149fdf32018-07-04 17:03:33 +0100839
840 // Store the 1x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100841 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100842 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100843 *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
844 *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
845 *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
846 *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100847#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
848 // Store the 4x1 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100849 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenad02eb452018-07-18 11:45:30 +0100850 int mult_y = min(dst_size - offset, 1);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100851
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100852 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100853 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)),
854 A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100855 *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;
856 *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;
857 *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;
858 *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100859#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100860 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000861#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100862 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000863#else /* defined(SRC_DEPTH) */
864 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
865#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100866 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100867 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100868
869 // Store the 4x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100870 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100871 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100872 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100873 out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100874 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100875 out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100876 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +0100877 out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33),
878 VEC_DATA_TYPE(DATA_TYPE, 4)),
879 A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100880 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
881 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
882 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;
883 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;
884 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;
885 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;
886 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;
887 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;
888 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;
889 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;
890 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;
891 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;
892 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;
893 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;
894 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;
895 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100896
897#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100898}
899
900#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
901 ({ \
902 comm_fact.s0 = d1 + d2; \
903 comm_fact.s1 = d3 + d4; \
904 comm_fact.s2 = d5 + d6; \
905 \
906 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
907 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
908 \
909 comm_fact.s0 = d1 - d2; \
910 comm_fact.s1 = d3 - d4; \
911 comm_fact.s2 = d5 - d6; \
912 \
913 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
914 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
915 })
916
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100917/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100918 *
919 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100920 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
921 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
922 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
923 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100924 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100925 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100926 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100927 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
928 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
929 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
930 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
931 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
932 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100933 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
934 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100935 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
936 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
937 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
938 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
939 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
940 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100941 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
942 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
943 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
944 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100945 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
946 */
947__kernel void winograd_output_transform_4x4_5x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100948 TENSOR4D_DECLARATION(src),
949 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100950#if defined(HAS_BIAS)
951 ,
952 VECTOR_DECLARATION(bias)
953#endif // defined(HAS_BIAS)
954)
955{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100956 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000957#if defined(SRC_DEPTH)
958 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100959 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000960#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100961
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000962 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
963 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
964#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100965
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100966 // Compute output address
967 int y_in = get_global_id(1);
968 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
969 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
970 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000971#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100972 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000973#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100974
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000975#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100976 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000977#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100978
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000979 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
980#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100981
982 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100983 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
984 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
985 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
986 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
987 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
988 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
989 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
990 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100991
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100992#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
993 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000994 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
995 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
996 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
997 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100998
999#if defined(HAS_BIAS)
1000 // Add bias
1001 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1002
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001003 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001004
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001005 out00 += (DATA_TYPE)b;
1006 out01 += (DATA_TYPE)b;
1007 out02 += (DATA_TYPE)b;
1008 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001009#endif // defined(HAS_BIAS)
1010
1011 // Store the output tile
1012#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001013 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001014 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001015 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
1016 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
1017 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
1018 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001019#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001020 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
1021 (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001022#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1023
1024#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001025
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001026 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1027 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1028 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1029 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1030 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1031 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1032 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1033 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001034
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001035 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1036 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1037 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1038 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1039 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1040 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1041 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1042 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001043
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001044 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1045 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1046 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1047 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1048 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1049 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1050 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1051 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001052
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001053 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1054 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1055 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1056 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1057 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1058 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1059 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1060 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001061
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001062 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1063 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1064 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1065 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1066 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1067 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1068 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1069 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001070
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001071 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1072 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1073 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1074 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1075 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1076 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1077 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1078 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001079
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001080 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1081 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1082 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1083 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1084 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1085 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1086 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1087 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001088
1089 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001090 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001091 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001092 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001093 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001094
1095 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1096 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1097 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1098 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1099 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1100 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1101 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1102 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1103
1104 // Compute the 4x4 output tile
1105 comm_fact0 = tmp_col1 + tmp_col2;
1106 comm_fact1 = tmp_col3 + tmp_col4;
1107 comm_fact2 = tmp_col5 + tmp_col6;
1108
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001109 VEC_DATA_TYPE(float, 4)
1110 out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
1111 VEC_DATA_TYPE(float, 4)
1112 out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001113
1114 comm_fact0 = tmp_col1 - tmp_col2;
1115 comm_fact1 = tmp_col3 - tmp_col4;
1116 comm_fact2 = tmp_col5 - tmp_col6;
1117
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001118 VEC_DATA_TYPE(float, 4)
1119 out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
1120 VEC_DATA_TYPE(float, 4)
1121 out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001122
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001123#if defined(HAS_BIAS)
1124 // Add bias
1125 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1126
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001127 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001128
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001129 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1130 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1131 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1132 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001133#endif // defined(HAS_BIAS)
1134
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001135 // Store the output tile
Usama Arif6a98a6e2019-05-10 17:07:27 +01001136 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,
1137 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
1138 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,
1139 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
1140 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,
1141 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
1142 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,
1143 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001144#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001145}
1146
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001147/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001148 *
1149 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001150 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1151 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1152 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1153 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001154 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001155 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001156 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001157 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1158 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1159 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1160 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1161 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1162 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001163 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1164 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001165 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1166 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1167 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1168 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1169 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1170 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001171 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1172 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1173 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1174 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001175 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1176 */
1177__kernel void winograd_output_transform_4x4_5x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001178 TENSOR4D_DECLARATION(src),
1179 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001180#if defined(HAS_BIAS)
1181 VECTOR_DECLARATION(bias),
1182#endif // defined(HAS_BIAS)
1183 int dst_size)
1184{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001185 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001186#if defined(SRC_DEPTH)
1187 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001188 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001189#else /* defined(SRC_DEPTH) */
1190 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1191 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
1192#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001193
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001194 int y_in = get_global_id(1);
1195 int x_out = get_global_id(0);
1196 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
1197 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001198#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001199 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001200#endif /* defined(SRC_DEPTH) */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001201
1202 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001203 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
1204 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
1205 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
1206 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
1207 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
1208 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
1209 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
1210 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001211
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001212#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1213 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001214 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
1215 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
1216 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
1217 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001218
1219#if defined(HAS_BIAS)
1220 // Add bias
1221 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1222
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001223 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001224
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001225 out00 += (float)b;
1226 out01 += (float)b;
1227 out02 += (float)b;
1228 out03 += (float)b;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001229#endif // defined(HAS_BIAS)
1230
1231 // Store the output tile
1232#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1233 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001234#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001235 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001236#else /* defined(SRC_DEPTH) */
1237 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1238#endif /* defined(SRC_DEPTH) */
1239 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001240
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001241 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001242 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001243 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
1244 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
1245 *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
1246 *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001247#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1248 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001249 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001250 VEC_DATA_TYPE(DATA_TYPE, 4)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001251 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
1252 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001253 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
1254 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
1255 *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;
1256 *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001257#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1258
1259#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1260
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001261 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1262 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1263 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1264 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1265 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1266 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1267 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1268 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001269
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001270 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1271 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1272 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1273 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1274 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1275 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1276 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1277 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001278
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001279 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1280 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1281 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1282 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1283 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1284 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1285 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1286 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001287
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001288 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1289 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1290 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1291 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1292 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1293 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1294 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1295 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001296
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001297 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1298 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1299 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1300 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1301 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1302 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1303 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1304 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001305
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001306 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1307 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1308 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1309 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1310 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1311 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1312 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1313 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001314
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001315 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1316 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1317 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1318 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1319 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1320 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1321 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1322 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001323
1324 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001325 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001326 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001327 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001328 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001329
1330 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1331 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1332 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1333 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1334 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1335 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1336 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1337 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1338
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001339 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001340 comm_fact0 = tmp_col1 + tmp_col2;
1341 comm_fact1 = tmp_col3 + tmp_col4;
1342 comm_fact2 = tmp_col5 + tmp_col6;
1343
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001344 VEC_DATA_TYPE(float, 4)
1345 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
1346 VEC_DATA_TYPE(float, 4)
1347 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001348
1349 comm_fact0 = tmp_col1 - tmp_col2;
1350 comm_fact1 = tmp_col3 - tmp_col4;
1351 comm_fact2 = tmp_col5 - tmp_col6;
1352
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001353 VEC_DATA_TYPE(float, 4)
1354 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
1355 VEC_DATA_TYPE(float, 4)
1356 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001357
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001358#if defined(HAS_BIAS)
1359 // Add bias
1360 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1361
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001362 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001363
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001364 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1365 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1366 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1367 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001368#endif // defined(HAS_BIAS)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001369 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001370#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001371 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001372#else /* defined(SRC_DEPTH) */
1373 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1374#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +01001375 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
1376 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001377
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001378 // Store the output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001379 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001380 out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001381 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001382 out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001383 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001384 out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001385 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001386 out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001387
1388 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
1389 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
1390 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;
1391 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;
1392 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
1393 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
1394 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;
1395 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;
1396 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;
1397 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;
1398 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;
1399 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;
1400 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;
1401 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;
1402 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;
1403 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001404#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001405}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001406#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001407
1408#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001409#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001410/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1411 *
1412 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1413 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1414 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1415 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001416 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001417 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001418 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001419 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1420 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1421 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1422 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1423 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1424 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001425 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1426 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001427 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1428 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1429 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1430 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1431 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1432 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001433 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1434 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1435 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1436 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001437 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1438 */
1439__kernel void winograd_output_transform_2x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001440 TENSOR4D_DECLARATION(src),
1441 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001442#if defined(HAS_BIAS)
1443 ,
1444 VECTOR_DECLARATION(bias)
1445#endif // defined(HAS_BIAS)
1446)
1447{
1448 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1449 src_stride_x,
1450 src_step_x,
1451 src_stride_y,
1452 src_step_y,
1453 src_stride_z,
1454 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001455 src_stride_w,
1456 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001457 src_offset_first_element_in_bytes,
1458 dst_ptr,
1459 dst_stride_x,
1460 dst_step_x,
1461 dst_stride_y,
1462 dst_step_y,
1463 dst_stride_z,
1464 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001465 dst_stride_w,
1466 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001467 dst_offset_first_element_in_bytes
1468#if defined(HAS_BIAS)
1469 ,
1470 bias_ptr,
1471 bias_stride_x,
1472 bias_step_x,
1473 bias_offset_first_element_in_bytes
1474#endif // defined(HAS_BIAS)
1475 );
1476}
giuros013bfacb22019-04-01 12:07:02 +01001477
1478/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
1479 *
1480 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1481 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1482 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1483 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1484 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1485 *
1486 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1487 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1488 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1489 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1490 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1491 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1492 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1493 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1494 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1495 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1496 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1497 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1498 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1499 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1500 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1501 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1502 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1503 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1504 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1505 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1506 */
1507__kernel void winograd_output_transform_2x1_7x1_nhwc(
1508 TENSOR4D_DECLARATION(src),
1509 TENSOR4D_DECLARATION(dst),
1510#if defined(HAS_BIAS)
1511 VECTOR_DECLARATION(bias),
1512#endif // defined(HAS_BIAS)
1513 int dst_size)
1514{
1515 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1516 src_stride_x,
1517 src_step_x,
1518 src_stride_y,
1519 src_step_y,
1520 src_stride_z,
1521 src_step_z,
1522 src_stride_w,
1523 src_step_w,
1524 src_offset_first_element_in_bytes,
1525 dst_ptr,
1526 dst_stride_x,
1527 dst_step_x,
1528 dst_stride_y,
1529 dst_step_y,
1530 dst_stride_z,
1531 dst_step_z,
1532 dst_stride_w,
1533 dst_step_w,
1534 dst_offset_first_element_in_bytes,
1535#if defined(HAS_BIAS)
1536 bias_ptr,
1537 bias_stride_x,
1538 bias_step_x,
1539 bias_offset_first_element_in_bytes,
1540#endif // defined(HAS_BIAS)
1541 dst_size);
1542}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001543#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001544
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001545#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001546/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1547 *
1548 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1549 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1550 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1551 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001552 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001553 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001554 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001555 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1556 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1557 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1558 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1559 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1560 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001561 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1562 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001563 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1564 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1565 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1566 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1567 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1568 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001569 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1570 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1571 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1572 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001573 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1574 */
1575__kernel void winograd_output_transform_4x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001576 TENSOR4D_DECLARATION(src),
1577 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001578#if defined(HAS_BIAS)
1579 ,
1580 VECTOR_DECLARATION(bias)
1581#endif // defined(HAS_BIAS)
1582)
1583{
1584 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1585 src_stride_x,
1586 src_step_x,
1587 src_stride_y,
1588 src_step_y,
1589 src_stride_z,
1590 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001591 src_stride_w,
1592 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001593 src_offset_first_element_in_bytes,
1594 dst_ptr,
1595 dst_stride_x,
1596 dst_step_x,
1597 dst_stride_y,
1598 dst_step_y,
1599 dst_stride_z,
1600 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001601 dst_stride_w,
1602 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001603 dst_offset_first_element_in_bytes
1604#if defined(HAS_BIAS)
1605 ,
1606 bias_ptr,
1607 bias_stride_x,
1608 bias_step_x,
1609 bias_offset_first_element_in_bytes
1610#endif // defined(HAS_BIAS)
1611 );
1612}
1613
1614/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1615 *
1616 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1617 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1618 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1619 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001620 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001621 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001622 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001623 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1624 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1625 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1626 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1627 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1628 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001629 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1630 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001631 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1632 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1633 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1634 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1635 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1636 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001637 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1638 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1639 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1640 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001641 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1642 */
1643__kernel void winograd_output_transform_4x1_5x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001644 TENSOR4D_DECLARATION(src),
1645 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001646#if defined(HAS_BIAS)
1647 ,
1648 VECTOR_DECLARATION(bias)
1649#endif // defined(HAS_BIAS)
1650)
1651{
1652 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1653 src_stride_x,
1654 src_step_x,
1655 src_stride_y,
1656 src_step_y,
1657 src_stride_z,
1658 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001659 src_stride_w,
1660 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001661 src_offset_first_element_in_bytes,
1662 dst_ptr,
1663 dst_stride_x,
1664 dst_step_x,
1665 dst_stride_y,
1666 dst_step_y,
1667 dst_stride_z,
1668 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001669 dst_stride_w,
1670 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001671 dst_offset_first_element_in_bytes
1672#if defined(HAS_BIAS)
1673 ,
1674 bias_ptr,
1675 bias_stride_x,
1676 bias_step_x,
1677 bias_offset_first_element_in_bytes
1678#endif // defined(HAS_BIAS)
1679 );
1680}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001681
1682/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1683 *
1684 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1685 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1686 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1687 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001688 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001689 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001690 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001691 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1692 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1693 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1694 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1695 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1696 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001697 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1698 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001699 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1700 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1701 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1702 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1703 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1704 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001705 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1706 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1707 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1708 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001709 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1710 */
1711__kernel void winograd_output_transform_4x1_3x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001712 TENSOR4D_DECLARATION(src),
1713 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001714#if defined(HAS_BIAS)
1715 VECTOR_DECLARATION(bias),
1716#endif // defined(HAS_BIAS)
1717 int dst_size)
1718{
1719 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1720 src_stride_x,
1721 src_step_x,
1722 src_stride_y,
1723 src_step_y,
1724 src_stride_z,
1725 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001726 src_stride_w,
1727 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001728 src_offset_first_element_in_bytes,
1729 dst_ptr,
1730 dst_stride_x,
1731 dst_step_x,
1732 dst_stride_y,
1733 dst_step_y,
1734 dst_stride_z,
1735 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001736 dst_stride_w,
1737 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001738 dst_offset_first_element_in_bytes,
1739#if defined(HAS_BIAS)
1740 bias_ptr,
1741 bias_stride_x,
1742 bias_step_x,
1743 bias_offset_first_element_in_bytes,
1744#endif // defined(HAS_BIAS)
1745 dst_size);
1746}
1747
1748/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1749 *
1750 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1751 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1752 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1753 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001754 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001755 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001756 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001757 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1758 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1759 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1760 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1761 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1762 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001763 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1764 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001765 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1766 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1767 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1768 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1769 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1770 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001771 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1772 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1773 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1774 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001775 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1776 */
1777__kernel void winograd_output_transform_4x1_5x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001778 TENSOR4D_DECLARATION(src),
1779 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001780#if defined(HAS_BIAS)
1781 VECTOR_DECLARATION(bias),
1782#endif // defined(HAS_BIAS)
1783 int dst_size)
1784{
1785 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1786 src_stride_x,
1787 src_step_x,
1788 src_stride_y,
1789 src_step_y,
1790 src_stride_z,
1791 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001792 src_stride_w,
1793 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001794 src_offset_first_element_in_bytes,
1795 dst_ptr,
1796 dst_stride_x,
1797 dst_step_x,
1798 dst_stride_y,
1799 dst_step_y,
1800 dst_stride_z,
1801 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001802 dst_stride_w,
1803 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001804 dst_offset_first_element_in_bytes,
1805#if defined(HAS_BIAS)
1806 bias_ptr,
1807 bias_stride_x,
1808 bias_step_x,
1809 bias_offset_first_element_in_bytes,
1810#endif // defined(HAS_BIAS)
1811 dst_size);
1812}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001813#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001814#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1815
1816#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001817#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001818/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1819 *
1820 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1821 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1822 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1823 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001824 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001825 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001826 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001827 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1828 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1829 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1830 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1831 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1832 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001833 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1834 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001835 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1836 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1837 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1838 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1839 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1840 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001841 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1842 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1843 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1844 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001845 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1846 */
1847__kernel void winograd_output_transform_1x2_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001848 TENSOR4D_DECLARATION(src),
1849 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001850#if defined(HAS_BIAS)
1851 ,
1852 VECTOR_DECLARATION(bias)
1853#endif // defined(HAS_BIAS)
1854)
1855{
1856 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1857 src_stride_x,
1858 src_step_x,
1859 src_stride_y,
1860 src_step_y,
1861 src_stride_z,
1862 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001863 src_stride_w,
1864 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001865 src_offset_first_element_in_bytes,
1866 dst_ptr,
1867 dst_stride_x,
1868 dst_step_x,
1869 dst_stride_y,
1870 dst_step_y,
1871 dst_stride_z,
1872 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001873 dst_stride_w,
1874 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001875 dst_offset_first_element_in_bytes
1876#if defined(HAS_BIAS)
1877 ,
1878 bias_ptr,
1879 bias_stride_x,
1880 bias_step_x,
1881 bias_offset_first_element_in_bytes
1882#endif // defined(HAS_BIAS)
1883 );
1884}
giuros013bfacb22019-04-01 12:07:02 +01001885
1886/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
1887 *
1888 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1889 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1890 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1891 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1892 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1893 *
1894 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1895 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1896 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1897 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1898 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1899 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1900 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1901 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1902 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1903 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1904 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1905 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1906 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1907 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1908 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1909 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1910 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1911 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1912 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1913 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1914 */
1915__kernel void winograd_output_transform_1x2_1x7_nhwc(
1916 TENSOR4D_DECLARATION(src),
1917 TENSOR4D_DECLARATION(dst),
1918#if defined(HAS_BIAS)
1919 VECTOR_DECLARATION(bias),
1920#endif // defined(HAS_BIAS)
1921 int dst_size)
1922{
1923 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1924 src_stride_x,
1925 src_step_x,
1926 src_stride_y,
1927 src_step_y,
1928 src_stride_z,
1929 src_step_z,
1930 src_stride_w,
1931 src_step_w,
1932 src_offset_first_element_in_bytes,
1933 dst_ptr,
1934 dst_stride_x,
1935 dst_step_x,
1936 dst_stride_y,
1937 dst_step_y,
1938 dst_stride_z,
1939 dst_step_z,
1940 dst_stride_w,
1941 dst_step_w,
1942 dst_offset_first_element_in_bytes,
1943#if defined(HAS_BIAS)
1944 bias_ptr,
1945 bias_stride_x,
1946 bias_step_x,
1947 bias_offset_first_element_in_bytes,
1948#endif // defined(HAS_BIAS)
1949 dst_size);
1950}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001951#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001952
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001953#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001954/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1955 *
1956 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1957 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1958 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1959 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001960 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001961 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001962 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001963 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1964 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1965 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1966 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1967 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1968 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001969 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1970 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001971 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1972 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1973 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1974 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1975 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1976 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001977 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1978 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1979 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1980 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001981 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1982 */
1983__kernel void winograd_output_transform_1x4_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001984 TENSOR4D_DECLARATION(src),
1985 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001986#if defined(HAS_BIAS)
1987 ,
1988 VECTOR_DECLARATION(bias)
1989#endif // defined(HAS_BIAS)
1990)
1991{
1992 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1993 src_stride_x,
1994 src_step_x,
1995 src_stride_y,
1996 src_step_y,
1997 src_stride_z,
1998 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001999 src_stride_w,
2000 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002001 src_offset_first_element_in_bytes,
2002 dst_ptr,
2003 dst_stride_x,
2004 dst_step_x,
2005 dst_stride_y,
2006 dst_step_y,
2007 dst_stride_z,
2008 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002009 dst_stride_w,
2010 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002011 dst_offset_first_element_in_bytes
2012#if defined(HAS_BIAS)
2013 ,
2014 bias_ptr,
2015 bias_stride_x,
2016 bias_step_x,
2017 bias_offset_first_element_in_bytes
2018#endif // defined(HAS_BIAS)
2019 );
2020}
2021
2022/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
2023 *
2024 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2025 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2026 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2027 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002028 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002029 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002030 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002031 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2032 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2033 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2034 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2035 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2036 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002037 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2038 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002039 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2040 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2041 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2042 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2043 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2044 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002045 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2046 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2047 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2048 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002049 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2050 */
2051__kernel void winograd_output_transform_1x4_1x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002052 TENSOR4D_DECLARATION(src),
2053 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002054#if defined(HAS_BIAS)
2055 ,
2056 VECTOR_DECLARATION(bias)
2057#endif // defined(HAS_BIAS)
2058)
2059{
2060 winograd_output_transform_4x4_5x5_nchw(src_ptr,
2061 src_stride_x,
2062 src_step_x,
2063 src_stride_y,
2064 src_step_y,
2065 src_stride_z,
2066 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002067 src_stride_w,
2068 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002069 src_offset_first_element_in_bytes,
2070 dst_ptr,
2071 dst_stride_x,
2072 dst_step_x,
2073 dst_stride_y,
2074 dst_step_y,
2075 dst_stride_z,
2076 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002077 dst_stride_w,
2078 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002079 dst_offset_first_element_in_bytes
2080#if defined(HAS_BIAS)
2081 ,
2082 bias_ptr,
2083 bias_stride_x,
2084 bias_step_x,
2085 bias_offset_first_element_in_bytes
2086#endif // defined(HAS_BIAS)
2087 );
2088}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002089
2090/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
2091 *
2092 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2093 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2094 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2095 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002096 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002097 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002098 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002099 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2100 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2101 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2102 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2103 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2104 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002105 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2106 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002107 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2108 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2109 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2110 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2111 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2112 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002113 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2114 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2115 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2116 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002117 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2118 */
2119__kernel void winograd_output_transform_1x4_1x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002120 TENSOR4D_DECLARATION(src),
2121 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002122#if defined(HAS_BIAS)
2123 VECTOR_DECLARATION(bias),
2124#endif // defined(HAS_BIAS)
2125 int dst_size)
2126{
2127 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
2128 src_stride_x,
2129 src_step_x,
2130 src_stride_y,
2131 src_step_y,
2132 src_stride_z,
2133 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002134 src_stride_w,
2135 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002136 src_offset_first_element_in_bytes,
2137 dst_ptr,
2138 dst_stride_x,
2139 dst_step_x,
2140 dst_stride_y,
2141 dst_step_y,
2142 dst_stride_z,
2143 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002144 dst_stride_w,
2145 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002146 dst_offset_first_element_in_bytes,
2147#if defined(HAS_BIAS)
2148 bias_ptr,
2149 bias_stride_x,
2150 bias_step_x,
2151 bias_offset_first_element_in_bytes,
2152#endif // defined(HAS_BIAS)
2153 dst_size);
2154}
2155
2156/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
2157 *
2158 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2159 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2160 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2161 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002162 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002163 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002164 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002165 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2166 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2167 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2168 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2169 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2170 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002171 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2172 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002173 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2174 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2175 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2176 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2177 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2178 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002179 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2180 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2181 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2182 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002183 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2184 */
2185__kernel void winograd_output_transform_1x4_1x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002186 TENSOR4D_DECLARATION(src),
2187 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002188#if defined(HAS_BIAS)
2189 VECTOR_DECLARATION(bias),
2190#endif // defined(HAS_BIAS)
2191 int dst_size)
2192{
2193 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
2194 src_stride_x,
2195 src_step_x,
2196 src_stride_y,
2197 src_step_y,
2198 src_stride_z,
2199 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002200 src_stride_w,
2201 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002202 src_offset_first_element_in_bytes,
2203 dst_ptr,
2204 dst_stride_x,
2205 dst_step_x,
2206 dst_stride_y,
2207 dst_step_y,
2208 dst_stride_z,
2209 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002210 dst_stride_w,
2211 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002212 dst_offset_first_element_in_bytes,
2213#if defined(HAS_BIAS)
2214 bias_ptr,
2215 bias_stride_x,
2216 bias_step_x,
2217 bias_offset_first_element_in_bytes,
2218#endif // defined(HAS_BIAS)
2219 dst_size);
2220}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01002221#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002222#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002223#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)