blob: 1b39f62d9774cc7e5e2637ea94402991588ced88 [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
Gian Marco Iodicea8903c82021-03-24 14:48:22 +00002 * Copyright (c) 2018-2021 Arm Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Usama Arif6a98a6e2019-05-10 17:07:27 +010024#include "activation_float_helpers.h"
Gian Marco Iodicea8903c82021-03-24 14:48:22 +000025#include "helpers.h"
26#include "tile_helpers.h"
Manuel Bottini0d0028c2018-10-02 16:41:52 +010027
Georgios Pinitasffb57a02018-10-29 18:01:52 +000028#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010029#if defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010030/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
31 *
32 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
33 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
34 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
35 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
36 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010037 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Usama Arif6a98a6e2019-05-10 17:07:27 +010038 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottini0d0028c2018-10-02 16:41:52 +010039 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
40 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010041 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010042 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010043 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
44 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
45 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
46 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
47 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
48 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010049 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
50 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010051 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
52 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
53 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
54 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
55 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
56 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010057 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
58 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
59 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
60 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010061 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
62 */
63__kernel void winograd_output_transform_2x2_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +010064 TENSOR4D_DECLARATION(src),
65 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010066#if defined(HAS_BIAS)
67 ,
68 VECTOR_DECLARATION(bias)
69#endif // defined(HAS_BIAS)
70)
71{
72 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
Georgios Pinitasffb57a02018-10-29 18:01:52 +000073#if defined(SRC_DEPTH)
74 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +010075 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +000076#else /* defined(SRC_DEPTH) */
77 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
78 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
79#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010080
81 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010082 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
83 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
84 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
85 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010086
87#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
88 // Compute the 2x1 or 1x2 output tile
89 // out00 = d00 + d01 + d02
90 // out01 = d01 - d02 - d03
91
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +000092 float out00 = d00 + d01 + d02;
93 float out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010094#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010095
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010096 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
97 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
98 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
99 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100100
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100101 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
102 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
103 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
104 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100105
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100106 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
107 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
108 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
109 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100110
111 // Compute the 2x2 output tile
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000112 float k0 = d01 + d11 + d21;
113 float k1 = d02 + d12 + d22;
114 float k2 = d11 - d21 - d31;
115 float k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100116
117 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
118 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
119 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
120 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
121
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000122 float out00 = d10;
123 float out01 = -d13;
124 float out10 = d10;
125 float out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100126
127 out00 += d00 + d20 + k0 + k1;
128 out01 += k0 - k1 - (d03 + d23);
129 out10 += -d20 - d30 + k2 + k3;
130 out11 += k2 - k3 + d23 + d33;
131#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
132
133 int y_in = get_global_id(1);
134 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
135 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
136 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000137#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100138 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000139#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100140
141#if defined(HAS_BIAS)
142 // Add bias
143 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
144
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000145 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100146
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000147 out00 += (float)b;
148 out01 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100149#endif // defined(HAS_BIAS)
150
151 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000152#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100153 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000154#else /* defined(SRC_DEPTH) */
155 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
156#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100157
158 // Store the output tile
159#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Michalis Spyrouef6ec502020-07-31 11:38:36 +0100160 const VEC_DATA_TYPE(DATA_TYPE, 2)
Giorgio Arenad056e572020-10-12 11:53:51 +0100161 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100162 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
163 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100164#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +0100165 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100166 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100167#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
168
169#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
170#if defined(HAS_BIAS)
171 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100172 out10 += (DATA_TYPE)b;
173 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100174#endif // defined(HAS_BIAS)
Giorgio Arenad056e572020-10-12 11:53:51 +0100175 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100176 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100177#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
178}
giuros013bfacb22019-04-01 12:07:02 +0100179
180#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
181 ({ \
182 col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \
183 col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \
184 })
185
186/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
187 *
188 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
189 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
190 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100191 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
192 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
giuros013bfacb22019-04-01 12:07:02 +0100193 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
194 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
195 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
196 *
197 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
198 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
199 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
200 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
201 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
202 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
203 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
204 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
205 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
206 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
207 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
208 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
209 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
210 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
211 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
212 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
213 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
214 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
215 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
216 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
217 */
218__kernel void winograd_output_transform_2x2_7x7_nhwc(
219 TENSOR4D_DECLARATION(src),
220 TENSOR4D_DECLARATION(dst),
221#if defined(HAS_BIAS)
222 VECTOR_DECLARATION(bias),
223#endif // defined(HAS_BIAS)
224 int dst_size)
225{
226 // Each thread stores a 4x4/4x1 or 1x4 tile
227#if defined(SRC_DEPTH)
228 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
229 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
230#else /* defined(SRC_DEPTH) */
231 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
232 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
233#endif /* defined(SRC_DEPTH) */
234
235 int y_in = get_global_id(1);
236 int x_out = get_global_id(0);
237 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
238 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
239#if defined(SRC_DEPTH)
240 int batch = get_global_id(2) / SRC_DEPTH;
241#endif /* defined(SRC_DEPTH) */
242
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100243 __global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE);
giuros013bfacb22019-04-01 12:07:02 +0100244
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100245#if defined(SRC_DEPTH)
246 dst_base_ptr += batch * dst_stride_w;
247#endif // defined(SRC_DEPTH)
giuros013bfacb22019-04-01 12:07:02 +0100248
249 // Load the values across the channels to compose the input tile
250 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
251 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
252 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
253 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
254 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
255 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
256 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
257 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
258
259#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
260 // Compute out00, out01, out02 and out03
261 float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
262 float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
263
264#if defined(HAS_BIAS)
265 // Add bias
266 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
267
268 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
269
270 out00 += (float)b;
271 out01 += (float)b;
272#endif // defined(HAS_BIAS)
273
274 // Store the output tile
275#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100276
277 dst_base_ptr += y_out * dst_stride_y;
278
279 int2 offset_z = min((int2)z_out + (int2)(0, 1), (int2)((int)DST_HEIGHT - 1)) * (int2)dst_stride_z;
giuros013bfacb22019-04-01 12:07:02 +0100280
281 VEC_DATA_TYPE(DATA_TYPE, 2)
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000282 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100283
284 // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
285 // is overwritten with the valid one
286 *(__global DATA_TYPE *)(dst_base_ptr + offset_z.s1) = out0_dt.s1;
287 *(__global DATA_TYPE *)(dst_base_ptr + offset_z.s0) = out0_dt.s0;
giuros013bfacb22019-04-01 12:07:02 +0100288#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100289
290 dst_base_ptr += z_out * dst_stride_z;
291
292 int2 offset_y = min((int2)y_out + (int2)(0, 1), (int2)((int)DST_WIDTH - 1)) * (int2)dst_stride_y;
293
giuros013bfacb22019-04-01 12:07:02 +0100294 VEC_DATA_TYPE(DATA_TYPE, 2)
Giorgio Arenad056e572020-10-12 11:53:51 +0100295 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL,
296 B_VAL);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100297
298 // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
299 // is overwritten with the valid one
300 *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1) = out0_dt.s1;
301 *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0) = out0_dt.s0;
giuros013bfacb22019-04-01 12:07:02 +0100302#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
303
304#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
305
306 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
307 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
308 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
309 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
310 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
311 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
312 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
313 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
314
315 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
316 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
317 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
318 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
319 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
320 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
321 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
322 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
323
324 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
325 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
326 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
327 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
328 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
329 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
330 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
331 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
332
333 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
334 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
335 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
336 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
337 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
338 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
339 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
340 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
341
342 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
343 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
344 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
345 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
346 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
347 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
348 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
349 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
350
351 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
352 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
353 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
354 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
355 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
356 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
357 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
358 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
359
360 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
361 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
362 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
363 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
364 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
365 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
366 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
367 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
368
369 // Compute the 8x2 intermediate tensor
370 VEC_DATA_TYPE(float, 2)
371 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
372
373 COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
374 COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
375 COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
376 COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
377 COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
378 COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
379 COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
380 COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
381
382 // Compute the 2x2 output tile
383 VEC_DATA_TYPE(float, 2)
384 out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
385 VEC_DATA_TYPE(float, 2)
386 out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
387
388#if defined(HAS_BIAS)
389 // Add bias
390 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
391
392 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
393
394 out_col0 += (VEC_DATA_TYPE(float, 2))b;
395 out_col1 += (VEC_DATA_TYPE(float, 2))b;
396
397#endif // defined(HAS_BIAS)
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100398
399 int2 offset_y = min((int2)y_out + (int2)(0, 1), (int2)((int)DST_WIDTH - 1)) * (int2)dst_stride_y;
400 int2 offset_z = min((int2)z_out + (int2)(0, 1), (int2)((int)DST_HEIGHT - 1)) * (int2)dst_stride_z;
giuros013bfacb22019-04-01 12:07:02 +0100401
402 // Store the output tile
403 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +0100404 out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100405 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +0100406 out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
giuros013bfacb22019-04-01 12:07:02 +0100407
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100408 // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
409 // is overwritten with the valid one
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000410 *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1) = out_col1_dt.s1;
411 *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0) = out_col1_dt.s0;
412 *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1) = out_col0_dt.s1;
413 *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0) = out_col0_dt.s0;
giuros013bfacb22019-04-01 12:07:02 +0100414
415#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
416}
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100417#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100418
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100419#if defined(VEC_SIZE) && VEC_SIZE == 4
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100420/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
421 *
422 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
423 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
424 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
425 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
426 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100427 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100428 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100429 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100430 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
431 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
432 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
433 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
434 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
435 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100436 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
437 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100438 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
439 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
440 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
441 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
442 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
443 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100444 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
445 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
446 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
447 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100448 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
449 */
450__kernel void winograd_output_transform_4x4_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100451 TENSOR4D_DECLARATION(src),
452 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100453#if defined(HAS_BIAS)
454 ,
455 VECTOR_DECLARATION(bias)
456#endif // defined(HAS_BIAS)
457)
458{
459 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000460#if defined(SRC_DEPTH)
461 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100462 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000463#else /* defined(SRC_DEPTH) */
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000464 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
465 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000466#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100467
468 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100469 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
470 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
471 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
472 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
473 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
474 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100475
476#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
477 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000478 float out00 = d00 + d01 + d02 + d03 + d04;
479 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
480 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
481 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100482#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100483
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100484 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
485 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
486 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
487 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
488 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
489 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100490
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100491 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
492 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
493 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
494 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
495 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
496 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100497
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100498 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
499 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
500 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
501 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
502 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
503 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100504
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100505 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
506 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
507 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
508 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
509 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
510 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100511
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100512 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
513 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
514 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
515 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
516 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
517 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100518
519 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000520 float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
521 float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
522 float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
523 float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100524
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000525 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
526 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100527
528 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
529 out01 += k1 - d02 - d12 - d22 - d32 - d42;
530 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
531 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
532
533 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000534 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
535 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
536 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
537 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100538
539 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
540 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
541
542 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
543 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
544 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
545 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
546
547 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000548 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
549 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
550 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
551 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100552
553 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
554 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
555
556 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
557 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
558 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
559 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
560
561 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000562 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
563 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
564 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
565 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100566
567 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
568 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
569
570 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
571 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
572 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
573 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
574#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
575
576 int y_in = get_global_id(1);
577 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
578 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
579 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000580#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100581 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000582#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100583
584#if defined(HAS_BIAS)
585 // Add bias
586 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
587
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000588 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100589
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000590 out00 += (float)b;
591 out01 += (float)b;
592 out02 += (float)b;
593 out03 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100594#endif // defined(HAS_BIAS)
595
596 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000597#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100598 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000599#else /* defined(SRC_DEPTH) */
600 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
601#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100602
603 // Store the output tile
604#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100605 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100606 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
607 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100608 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
609 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
610 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
611 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100612#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +0100613 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100614 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100615#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
616
617#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
618#if defined(HAS_BIAS)
619 // Add bias
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000620 out10 += (float)b;
621 out11 += (float)b;
622 out12 += (float)b;
623 out13 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100624
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000625 out20 += (float)b;
626 out21 += (float)b;
627 out22 += (float)b;
628 out23 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100629
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000630 out30 += (float)b;
631 out31 += (float)b;
632 out32 += (float)b;
633 out33 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100634#endif // defined(HAS_BIAS)
Giorgio Arenad056e572020-10-12 11:53:51 +0100635 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100636 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100637 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100638 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100639 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100640 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100641#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
642}
643
Giorgio Arena149fdf32018-07-04 17:03:33 +0100644/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100645 *
646 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100647 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
648 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000649 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100650 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
651 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Giorgio Arena149fdf32018-07-04 17:03:33 +0100652 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
653 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100654 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000655 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100656 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100657 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100658 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
659 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
660 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
661 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
662 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
663 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100664 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
665 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100666 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
667 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
668 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
669 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
670 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
671 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100672 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
673 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
674 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
675 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100676 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
677 * @param[in] dst_size Size of the destination tensor, minus the last padding
678 */
679__kernel void winograd_output_transform_4x4_3x3_nhwc(
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000680 TENSOR4D(src, BUFFER),
681 TENSOR4D(dst, BUFFER),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100682#if defined(HAS_BIAS)
683 VECTOR_DECLARATION(bias),
684#endif // defined(HAS_BIAS)
685 int dst_size)
686{
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000687 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
688 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
689 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100690
Giorgio Arena149fdf32018-07-04 17:03:33 +0100691#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100692
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000693 TILE(DATA_TYPE, 6, N0, in) = { { 0 } };
694 TILE(DATA_TYPE, 4, N0, out) = { { 0 } };
695 TILE(uint, 6, 1, src_indirect_y) = { { 0 } };
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100696
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000697 LOOP_UNROLLING(int, i, 0, 6, 1)
698 {
699 src_indirect_y[i].v = mout + i * SRC_HEIGHT;
700 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
701 }
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100702
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000703 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
704 T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100705
706 // Compute out00, out01, out02 and out03
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000707 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
708 out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
709 out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
710 out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100711
712#if defined(HAS_BIAS)
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000713 TILE(DATA_TYPE, 1, N0, b);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100714
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000715 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100716
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000717 // c = c + bias[broadcasted]
718 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
719#endif // HAS_BIAS
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100720
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000721 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
722 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100723
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000724 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100725
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000726 TILE(uint, 4, 1, dst_indirect_y) = { { 0 } };
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100727
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000728 // Calculate the destination indirect Y
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100729#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000730 LOOP_UNROLLING(int, yk, 0, 4, 1)
731 {
732 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
733 dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
734 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
735 }
736#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
737 LOOP_UNROLLING(int, xk, 0, 4, 1)
738 {
739 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
740 dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
741 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
742 }
743#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100744
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000745 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
746 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100747
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000748#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100749
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000750 // Calculate the indirect Y for the source tensor
751 TILE(DATA_TYPE, 36, N0, in) = { { 0 } };
752 TILE(DATA_TYPE, 4, N0, tmp) = { { 0 } };
753 TILE(uint, 36, 1, src_indirect_y) = { { 0 } };
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100754
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000755 LOOP_UNROLLING(int, i, 0, 36, 1)
756 {
757 src_indirect_y[i].v = mout + i * SRC_HEIGHT;
758 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
759 }
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100760
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000761 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
762 T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100763
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000764 LOOP_UNROLLING(int, i, 0, 6, 1)
765 {
766 tmp[0].v = in[6 + i].v + in[12 + i].v;
767 tmp[1].v = in[6 + i].v - in[12 + i].v;
768 tmp[2].v = in[18 + i].v + in[24 + i].v;
769 tmp[3].v = in[18 + i].v - in[24 + i].v;
770 tmp[3].v = tmp[3].v + tmp[3].v;
771 in[i].v = in[i].v + tmp[0].v + tmp[2].v;
772 in[6 + i].v = tmp[3].v + tmp[1].v;
773 in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
774 in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
775 }
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100776
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000777 // Compute the output tile
778 TILE(DATA_TYPE, 16, N0, out) = { { 0 } };
Giorgio Arena149fdf32018-07-04 17:03:33 +0100779
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000780 LOOP_UNROLLING(int, i, 0, 4, 1)
781 {
782 tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
783 tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
784 tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
785 tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
786 tmp[3].v = tmp[3].v + tmp[3].v;
787 out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
788 out[4 * i + 1].v = tmp[3].v + tmp[1].v;
789 out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
790 out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
791 }
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100792
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000793#if defined(HAS_BIAS)
794 TILE(DATA_TYPE, 1, N0, b);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100795
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000796 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100797
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000798 // c = c + bias[broadcasted]
799 T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
800#endif // HAS_BIAS
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100801
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000802 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
803 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100804
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000805 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
806
807 TILE(uint, 16, 1, dst_indirect_y) = { { 0 } };
808
809 // Calculate the destination indirect Y
810 LOOP_UNROLLING(int, yk, 0, 4, 1)
811 {
812 LOOP_UNROLLING(int, xk, 0, 4, 1)
813 {
814 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
815 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
816 dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
817 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
818 }
819 }
820
821 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
822 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
823#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100824}
825
826#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
827 ({ \
828 comm_fact.s0 = d1 + d2; \
829 comm_fact.s1 = d3 + d4; \
830 comm_fact.s2 = d5 + d6; \
831 \
832 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
833 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
834 \
835 comm_fact.s0 = d1 - d2; \
836 comm_fact.s1 = d3 - d4; \
837 comm_fact.s2 = d5 - d6; \
838 \
839 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
840 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
841 })
842
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100843/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100844 *
845 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100846 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
847 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
848 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
849 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100850 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100851 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100852 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100853 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
854 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
855 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
856 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
857 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
858 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100859 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
860 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100861 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
862 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
863 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
864 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
865 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
866 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100867 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
868 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
869 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
870 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100871 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
872 */
873__kernel void winograd_output_transform_4x4_5x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100874 TENSOR4D_DECLARATION(src),
875 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100876#if defined(HAS_BIAS)
877 ,
878 VECTOR_DECLARATION(bias)
879#endif // defined(HAS_BIAS)
880)
881{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100882 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000883#if defined(SRC_DEPTH)
884 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100885 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000886#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100887
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000888 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
889 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
890#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100891
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100892 // Compute output address
893 int y_in = get_global_id(1);
894 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
895 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
896 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000897#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100898 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000899#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100900
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000901#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100902 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000903#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100904
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000905 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
906#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100907
908 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100909 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
910 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
911 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
912 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
913 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
914 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
915 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
916 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100917
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100918#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
919 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000920 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
921 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
922 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
923 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100924
925#if defined(HAS_BIAS)
926 // Add bias
927 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
928
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000929 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100930
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100931 out00 += (DATA_TYPE)b;
932 out01 += (DATA_TYPE)b;
933 out02 += (DATA_TYPE)b;
934 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100935#endif // defined(HAS_BIAS)
936
937 // Store the output tile
938#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100939 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100940 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
941 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100942 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
943 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
944 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
945 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100946#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +0100947 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100948 (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100949#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
950
951#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100952
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000953 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
954 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
955 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
956 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
957 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
958 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
959 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
960 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100961
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100962 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
963 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
964 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
965 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
966 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
967 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
968 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
969 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100970
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100971 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
972 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
973 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
974 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
975 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
976 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
977 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
978 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100979
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100980 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
981 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
982 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
983 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
984 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
985 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
986 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
987 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100988
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100989 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
990 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
991 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
992 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
993 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
994 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
995 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
996 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100997
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100998 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
999 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1000 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1001 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1002 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1003 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1004 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1005 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001006
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001007 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1008 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1009 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1010 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1011 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1012 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1013 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1014 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001015
1016 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001017 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001018 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001019 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001020 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001021
1022 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1023 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1024 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1025 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1026 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1027 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1028 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1029 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1030
1031 // Compute the 4x4 output tile
1032 comm_fact0 = tmp_col1 + tmp_col2;
1033 comm_fact1 = tmp_col3 + tmp_col4;
1034 comm_fact2 = tmp_col5 + tmp_col6;
1035
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001036 VEC_DATA_TYPE(float, 4)
1037 out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
1038 VEC_DATA_TYPE(float, 4)
1039 out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001040
1041 comm_fact0 = tmp_col1 - tmp_col2;
1042 comm_fact1 = tmp_col3 - tmp_col4;
1043 comm_fact2 = tmp_col5 - tmp_col6;
1044
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001045 VEC_DATA_TYPE(float, 4)
1046 out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
1047 VEC_DATA_TYPE(float, 4)
1048 out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001049
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001050#if defined(HAS_BIAS)
1051 // Add bias
1052 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1053
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001054 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001055
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001056 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1057 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1058 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1059 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001060#endif // defined(HAS_BIAS)
1061
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001062 // Store the output tile
Giorgio Arenad056e572020-10-12 11:53:51 +01001063 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001064 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +01001065 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001066 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +01001067 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001068 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +01001069 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001070 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001071#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001072}
1073
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001074/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001075 *
1076 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001077 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1078 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001079 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1080 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001081 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1082 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001083 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001084 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001085 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001086 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1087 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1088 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1089 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1090 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1091 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001092 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1093 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001094 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1095 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1096 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1097 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1098 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1099 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001100 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1101 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1102 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1103 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001104 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1105 */
1106__kernel void winograd_output_transform_4x4_5x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001107 TENSOR4D_DECLARATION(src),
1108 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001109#if defined(HAS_BIAS)
1110 VECTOR_DECLARATION(bias),
1111#endif // defined(HAS_BIAS)
1112 int dst_size)
1113{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001114 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001115#if defined(SRC_DEPTH)
1116 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001117 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001118#else /* defined(SRC_DEPTH) */
1119 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1120 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
1121#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001122
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001123 int y_in = get_global_id(1);
1124 int x_out = get_global_id(0);
1125 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
1126 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001127#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001128 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001129#endif /* defined(SRC_DEPTH) */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001130
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001131 __global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE);
1132
1133#if defined(SRC_DEPTH)
1134 dst_base_ptr += batch * dst_stride_w;
1135#endif // defined(SRC_DEPTH)
1136
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001137 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001138 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
1139 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
1140 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
1141 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
1142 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
1143 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
1144 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
1145 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001146
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001147#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1148 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001149 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
1150 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
1151 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
1152 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001153
1154#if defined(HAS_BIAS)
1155 // Add bias
1156 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1157
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001158 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001159
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001160 out00 += (float)b;
1161 out01 += (float)b;
1162 out02 += (float)b;
1163 out03 += (float)b;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001164#endif // defined(HAS_BIAS)
1165
1166 // Store the output tile
1167#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001168
1169 dst_base_ptr += y_out * dst_stride_y;
1170
1171 int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001172
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001173 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +01001174 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
1175 B_VAL);
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001176
1177 // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
1178 // is overwritten with the valid one
1179 *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s3)) = out0_dt.s3;
1180 *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s2)) = out0_dt.s2;
1181 *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s1)) = out0_dt.s1;
1182 *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s0)) = out0_dt.s0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001183#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001184
1185 dst_base_ptr += z_out * dst_stride_z;
1186
1187 int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y;
1188
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001189 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +01001190 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
Usama Arif6a98a6e2019-05-10 17:07:27 +01001191 B_VAL);
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001192
1193 // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
1194 // is overwritten with the valid one
1195 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3)) = out0_dt.s3;
1196 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2)) = out0_dt.s2;
1197 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1)) = out0_dt.s1;
1198 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0)) = out0_dt.s0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001199#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1200
1201#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1202
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001203 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1204 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1205 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1206 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1207 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1208 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1209 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1210 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001211
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001212 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1213 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1214 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1215 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1216 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1217 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1218 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1219 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001220
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001221 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1222 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1223 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1224 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1225 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1226 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1227 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1228 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001229
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001230 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1231 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1232 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1233 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1234 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1235 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1236 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1237 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001238
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001239 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1240 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1241 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1242 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1243 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1244 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1245 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1246 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001247
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001248 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1249 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1250 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1251 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1252 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1253 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1254 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1255 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001256
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001257 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1258 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1259 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1260 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1261 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1262 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1263 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1264 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001265
1266 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001267 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001268 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001269 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001270 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001271
1272 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1273 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1274 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1275 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1276 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1277 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1278 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1279 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1280
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001281 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001282 comm_fact0 = tmp_col1 + tmp_col2;
1283 comm_fact1 = tmp_col3 + tmp_col4;
1284 comm_fact2 = tmp_col5 + tmp_col6;
1285
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001286 VEC_DATA_TYPE(float, 4)
1287 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
1288 VEC_DATA_TYPE(float, 4)
1289 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001290
1291 comm_fact0 = tmp_col1 - tmp_col2;
1292 comm_fact1 = tmp_col3 - tmp_col4;
1293 comm_fact2 = tmp_col5 - tmp_col6;
1294
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001295 VEC_DATA_TYPE(float, 4)
1296 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
1297 VEC_DATA_TYPE(float, 4)
1298 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001299
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001300#if defined(HAS_BIAS)
1301 // Add bias
1302 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1303
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001304 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001305
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001306 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1307 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1308 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1309 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001310#endif // defined(HAS_BIAS)
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001311
1312 int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y;
1313 int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001314
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001315 // Store the output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001316 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001317 out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001318 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001319 out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001320 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001321 out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001322 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001323 out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001324
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001325 // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
1326 // is overwritten with the valid one
1327 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s3)) = out_col3_dt.s3;
1328 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s3)) = out_col2_dt.s3;
1329 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s3)) = out_col1_dt.s3;
1330 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s3)) = out_col0_dt.s3;
1331 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s2)) = out_col3_dt.s2;
1332 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s2)) = out_col2_dt.s2;
1333 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s2)) = out_col1_dt.s2;
1334 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s2)) = out_col0_dt.s2;
1335 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s1)) = out_col3_dt.s1;
1336 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s1)) = out_col2_dt.s1;
1337 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1)) = out_col1_dt.s1;
1338 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1)) = out_col0_dt.s1;
1339 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s0)) = out_col3_dt.s0;
1340 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s0)) = out_col2_dt.s0;
1341 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0)) = out_col1_dt.s0;
1342 *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0)) = out_col0_dt.s0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001343#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001344}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001345#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001346
1347#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001348#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001349/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1350 *
1351 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1352 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1353 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1354 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001355 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001356 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001357 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001358 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1359 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1360 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1361 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1362 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1363 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001364 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1365 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001366 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1367 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1368 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1369 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1370 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1371 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001372 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1373 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1374 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1375 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001376 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1377 */
1378__kernel void winograd_output_transform_2x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001379 TENSOR4D_DECLARATION(src),
1380 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001381#if defined(HAS_BIAS)
1382 ,
1383 VECTOR_DECLARATION(bias)
1384#endif // defined(HAS_BIAS)
1385)
1386{
1387 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1388 src_stride_x,
1389 src_step_x,
1390 src_stride_y,
1391 src_step_y,
1392 src_stride_z,
1393 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001394 src_stride_w,
1395 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001396 src_offset_first_element_in_bytes,
1397 dst_ptr,
1398 dst_stride_x,
1399 dst_step_x,
1400 dst_stride_y,
1401 dst_step_y,
1402 dst_stride_z,
1403 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001404 dst_stride_w,
1405 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001406 dst_offset_first_element_in_bytes
1407#if defined(HAS_BIAS)
1408 ,
1409 bias_ptr,
1410 bias_stride_x,
1411 bias_step_x,
1412 bias_offset_first_element_in_bytes
1413#endif // defined(HAS_BIAS)
1414 );
1415}
giuros013bfacb22019-04-01 12:07:02 +01001416
1417/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
1418 *
1419 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1420 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1421 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001422 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1423 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
giuros013bfacb22019-04-01 12:07:02 +01001424 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1425 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1426 *
1427 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1428 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1429 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1430 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1431 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1432 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1433 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1434 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1435 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1436 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1437 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1438 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1439 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1440 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1441 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1442 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1443 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1444 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1445 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1446 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1447 */
1448__kernel void winograd_output_transform_2x1_7x1_nhwc(
1449 TENSOR4D_DECLARATION(src),
1450 TENSOR4D_DECLARATION(dst),
1451#if defined(HAS_BIAS)
1452 VECTOR_DECLARATION(bias),
1453#endif // defined(HAS_BIAS)
1454 int dst_size)
1455{
1456 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1457 src_stride_x,
1458 src_step_x,
1459 src_stride_y,
1460 src_step_y,
1461 src_stride_z,
1462 src_step_z,
1463 src_stride_w,
1464 src_step_w,
1465 src_offset_first_element_in_bytes,
1466 dst_ptr,
1467 dst_stride_x,
1468 dst_step_x,
1469 dst_stride_y,
1470 dst_step_y,
1471 dst_stride_z,
1472 dst_step_z,
1473 dst_stride_w,
1474 dst_step_w,
1475 dst_offset_first_element_in_bytes,
1476#if defined(HAS_BIAS)
1477 bias_ptr,
1478 bias_stride_x,
1479 bias_step_x,
1480 bias_offset_first_element_in_bytes,
1481#endif // defined(HAS_BIAS)
1482 dst_size);
1483}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001484#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001485
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001486#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001487/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1488 *
1489 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1490 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1491 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1492 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001493 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001494 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001495 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001496 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1497 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1498 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1499 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1500 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1501 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001502 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1503 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001504 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1505 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1506 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1507 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1508 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1509 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001510 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1511 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1512 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1513 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001514 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1515 */
1516__kernel void winograd_output_transform_4x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001517 TENSOR4D_DECLARATION(src),
1518 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001519#if defined(HAS_BIAS)
1520 ,
1521 VECTOR_DECLARATION(bias)
1522#endif // defined(HAS_BIAS)
1523)
1524{
1525 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1526 src_stride_x,
1527 src_step_x,
1528 src_stride_y,
1529 src_step_y,
1530 src_stride_z,
1531 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001532 src_stride_w,
1533 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001534 src_offset_first_element_in_bytes,
1535 dst_ptr,
1536 dst_stride_x,
1537 dst_step_x,
1538 dst_stride_y,
1539 dst_step_y,
1540 dst_stride_z,
1541 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001542 dst_stride_w,
1543 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001544 dst_offset_first_element_in_bytes
1545#if defined(HAS_BIAS)
1546 ,
1547 bias_ptr,
1548 bias_stride_x,
1549 bias_step_x,
1550 bias_offset_first_element_in_bytes
1551#endif // defined(HAS_BIAS)
1552 );
1553}
1554
1555/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1556 *
1557 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1558 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1559 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1560 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001561 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001562 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001563 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001564 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1565 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1566 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1567 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1568 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1569 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001570 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1571 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001572 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1573 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1574 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1575 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1576 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1577 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001578 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1579 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1580 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1581 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001582 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1583 */
1584__kernel void winograd_output_transform_4x1_5x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001585 TENSOR4D_DECLARATION(src),
1586 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001587#if defined(HAS_BIAS)
1588 ,
1589 VECTOR_DECLARATION(bias)
1590#endif // defined(HAS_BIAS)
1591)
1592{
1593 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1594 src_stride_x,
1595 src_step_x,
1596 src_stride_y,
1597 src_step_y,
1598 src_stride_z,
1599 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001600 src_stride_w,
1601 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001602 src_offset_first_element_in_bytes,
1603 dst_ptr,
1604 dst_stride_x,
1605 dst_step_x,
1606 dst_stride_y,
1607 dst_step_y,
1608 dst_stride_z,
1609 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001610 dst_stride_w,
1611 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001612 dst_offset_first_element_in_bytes
1613#if defined(HAS_BIAS)
1614 ,
1615 bias_ptr,
1616 bias_stride_x,
1617 bias_step_x,
1618 bias_offset_first_element_in_bytes
1619#endif // defined(HAS_BIAS)
1620 );
1621}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001622
1623/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1624 *
1625 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1626 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1627 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001628 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1629 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001630 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001631 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001632 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001633 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001634 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1635 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1636 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1637 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1638 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1639 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001640 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1641 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001642 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1643 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1644 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1645 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1646 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1647 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001648 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1649 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1650 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1651 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001652 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1653 */
1654__kernel void winograd_output_transform_4x1_3x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001655 TENSOR4D_DECLARATION(src),
1656 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001657#if defined(HAS_BIAS)
1658 VECTOR_DECLARATION(bias),
1659#endif // defined(HAS_BIAS)
1660 int dst_size)
1661{
1662 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1663 src_stride_x,
1664 src_step_x,
1665 src_stride_y,
1666 src_step_y,
1667 src_stride_z,
1668 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001669 src_stride_w,
1670 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001671 src_offset_first_element_in_bytes,
1672 dst_ptr,
1673 dst_stride_x,
1674 dst_step_x,
1675 dst_stride_y,
1676 dst_step_y,
1677 dst_stride_z,
1678 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001679 dst_stride_w,
1680 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001681 dst_offset_first_element_in_bytes,
1682#if defined(HAS_BIAS)
1683 bias_ptr,
1684 bias_stride_x,
1685 bias_step_x,
1686 bias_offset_first_element_in_bytes,
1687#endif // defined(HAS_BIAS)
1688 dst_size);
1689}
1690
1691/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1692 *
1693 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1694 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1695 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001696 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1697 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001698 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001699 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001700 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001701 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001702 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1703 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1704 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1705 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1706 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1707 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001708 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1709 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001710 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1711 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1712 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1713 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1714 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1715 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001716 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1717 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1718 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1719 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001720 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1721 */
1722__kernel void winograd_output_transform_4x1_5x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001723 TENSOR4D_DECLARATION(src),
1724 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001725#if defined(HAS_BIAS)
1726 VECTOR_DECLARATION(bias),
1727#endif // defined(HAS_BIAS)
1728 int dst_size)
1729{
1730 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1731 src_stride_x,
1732 src_step_x,
1733 src_stride_y,
1734 src_step_y,
1735 src_stride_z,
1736 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001737 src_stride_w,
1738 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001739 src_offset_first_element_in_bytes,
1740 dst_ptr,
1741 dst_stride_x,
1742 dst_step_x,
1743 dst_stride_y,
1744 dst_step_y,
1745 dst_stride_z,
1746 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001747 dst_stride_w,
1748 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001749 dst_offset_first_element_in_bytes,
1750#if defined(HAS_BIAS)
1751 bias_ptr,
1752 bias_stride_x,
1753 bias_step_x,
1754 bias_offset_first_element_in_bytes,
1755#endif // defined(HAS_BIAS)
1756 dst_size);
1757}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001758#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001759#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1760
1761#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001762#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001763/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1764 *
1765 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1766 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1767 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1768 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001769 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001770 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001771 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001772 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1773 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1774 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1775 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1776 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1777 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001778 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1779 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001780 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1781 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1782 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1783 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1784 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1785 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001786 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1787 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1788 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1789 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001790 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1791 */
1792__kernel void winograd_output_transform_1x2_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001793 TENSOR4D_DECLARATION(src),
1794 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001795#if defined(HAS_BIAS)
1796 ,
1797 VECTOR_DECLARATION(bias)
1798#endif // defined(HAS_BIAS)
1799)
1800{
1801 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1802 src_stride_x,
1803 src_step_x,
1804 src_stride_y,
1805 src_step_y,
1806 src_stride_z,
1807 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001808 src_stride_w,
1809 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001810 src_offset_first_element_in_bytes,
1811 dst_ptr,
1812 dst_stride_x,
1813 dst_step_x,
1814 dst_stride_y,
1815 dst_step_y,
1816 dst_stride_z,
1817 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001818 dst_stride_w,
1819 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001820 dst_offset_first_element_in_bytes
1821#if defined(HAS_BIAS)
1822 ,
1823 bias_ptr,
1824 bias_stride_x,
1825 bias_step_x,
1826 bias_offset_first_element_in_bytes
1827#endif // defined(HAS_BIAS)
1828 );
1829}
giuros013bfacb22019-04-01 12:07:02 +01001830
1831/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
1832 *
1833 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1834 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1835 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001836 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1837 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
giuros013bfacb22019-04-01 12:07:02 +01001838 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1839 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1840 *
1841 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1842 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1843 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1844 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1845 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1846 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1847 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1848 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1849 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1850 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1851 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1852 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1853 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1854 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1855 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1856 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1857 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1858 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1859 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1860 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1861 */
1862__kernel void winograd_output_transform_1x2_1x7_nhwc(
1863 TENSOR4D_DECLARATION(src),
1864 TENSOR4D_DECLARATION(dst),
1865#if defined(HAS_BIAS)
1866 VECTOR_DECLARATION(bias),
1867#endif // defined(HAS_BIAS)
1868 int dst_size)
1869{
1870 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1871 src_stride_x,
1872 src_step_x,
1873 src_stride_y,
1874 src_step_y,
1875 src_stride_z,
1876 src_step_z,
1877 src_stride_w,
1878 src_step_w,
1879 src_offset_first_element_in_bytes,
1880 dst_ptr,
1881 dst_stride_x,
1882 dst_step_x,
1883 dst_stride_y,
1884 dst_step_y,
1885 dst_stride_z,
1886 dst_step_z,
1887 dst_stride_w,
1888 dst_step_w,
1889 dst_offset_first_element_in_bytes,
1890#if defined(HAS_BIAS)
1891 bias_ptr,
1892 bias_stride_x,
1893 bias_step_x,
1894 bias_offset_first_element_in_bytes,
1895#endif // defined(HAS_BIAS)
1896 dst_size);
1897}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001898#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001899
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001900#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001901/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1902 *
1903 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1904 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1905 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1906 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001907 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001908 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001909 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001910 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1911 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1912 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1913 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1914 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1915 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001916 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1917 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001918 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1919 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1920 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1921 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1922 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1923 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001924 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1925 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1926 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1927 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001928 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1929 */
1930__kernel void winograd_output_transform_1x4_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001931 TENSOR4D_DECLARATION(src),
1932 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001933#if defined(HAS_BIAS)
1934 ,
1935 VECTOR_DECLARATION(bias)
1936#endif // defined(HAS_BIAS)
1937)
1938{
1939 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1940 src_stride_x,
1941 src_step_x,
1942 src_stride_y,
1943 src_step_y,
1944 src_stride_z,
1945 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001946 src_stride_w,
1947 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001948 src_offset_first_element_in_bytes,
1949 dst_ptr,
1950 dst_stride_x,
1951 dst_step_x,
1952 dst_stride_y,
1953 dst_step_y,
1954 dst_stride_z,
1955 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001956 dst_stride_w,
1957 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001958 dst_offset_first_element_in_bytes
1959#if defined(HAS_BIAS)
1960 ,
1961 bias_ptr,
1962 bias_stride_x,
1963 bias_step_x,
1964 bias_offset_first_element_in_bytes
1965#endif // defined(HAS_BIAS)
1966 );
1967}
1968
1969/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
1970 *
1971 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1972 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1973 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1974 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001975 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001976 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001977 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001978 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1979 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1980 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1981 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1982 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1983 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001984 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1985 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001986 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1987 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1988 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1989 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1990 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1991 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001992 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1993 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1994 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1995 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001996 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1997 */
1998__kernel void winograd_output_transform_1x4_1x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001999 TENSOR4D_DECLARATION(src),
2000 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002001#if defined(HAS_BIAS)
2002 ,
2003 VECTOR_DECLARATION(bias)
2004#endif // defined(HAS_BIAS)
2005)
2006{
2007 winograd_output_transform_4x4_5x5_nchw(src_ptr,
2008 src_stride_x,
2009 src_step_x,
2010 src_stride_y,
2011 src_step_y,
2012 src_stride_z,
2013 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002014 src_stride_w,
2015 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002016 src_offset_first_element_in_bytes,
2017 dst_ptr,
2018 dst_stride_x,
2019 dst_step_x,
2020 dst_stride_y,
2021 dst_step_y,
2022 dst_stride_z,
2023 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002024 dst_stride_w,
2025 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002026 dst_offset_first_element_in_bytes
2027#if defined(HAS_BIAS)
2028 ,
2029 bias_ptr,
2030 bias_stride_x,
2031 bias_step_x,
2032 bias_offset_first_element_in_bytes
2033#endif // defined(HAS_BIAS)
2034 );
2035}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002036
2037/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
2038 *
2039 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2040 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2041 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodice5f910412020-10-20 09:14:45 +01002042 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
2043 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002044 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002045 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002046 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002047 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002048 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2049 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2050 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2051 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2052 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2053 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002054 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2055 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002056 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2057 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2058 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2059 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2060 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2061 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002062 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2063 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2064 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2065 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002066 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2067 */
2068__kernel void winograd_output_transform_1x4_1x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002069 TENSOR4D_DECLARATION(src),
2070 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002071#if defined(HAS_BIAS)
2072 VECTOR_DECLARATION(bias),
2073#endif // defined(HAS_BIAS)
2074 int dst_size)
2075{
2076 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
2077 src_stride_x,
2078 src_step_x,
2079 src_stride_y,
2080 src_step_y,
2081 src_stride_z,
2082 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002083 src_stride_w,
2084 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002085 src_offset_first_element_in_bytes,
2086 dst_ptr,
2087 dst_stride_x,
2088 dst_step_x,
2089 dst_stride_y,
2090 dst_step_y,
2091 dst_stride_z,
2092 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002093 dst_stride_w,
2094 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002095 dst_offset_first_element_in_bytes,
2096#if defined(HAS_BIAS)
2097 bias_ptr,
2098 bias_stride_x,
2099 bias_step_x,
2100 bias_offset_first_element_in_bytes,
2101#endif // defined(HAS_BIAS)
2102 dst_size);
2103}
2104
2105/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
2106 *
2107 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
2108 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2109 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodice5f910412020-10-20 09:14:45 +01002110 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
2111 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002112 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002113 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002114 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002115 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002116 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2117 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2118 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2119 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2120 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2121 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002122 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2123 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002124 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2125 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
2126 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2127 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2128 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2129 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002130 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2131 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
2132 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2133 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002134 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2135 */
2136__kernel void winograd_output_transform_1x4_1x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002137 TENSOR4D_DECLARATION(src),
2138 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002139#if defined(HAS_BIAS)
2140 VECTOR_DECLARATION(bias),
2141#endif // defined(HAS_BIAS)
2142 int dst_size)
2143{
2144 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
2145 src_stride_x,
2146 src_step_x,
2147 src_stride_y,
2148 src_step_y,
2149 src_stride_z,
2150 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002151 src_stride_w,
2152 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002153 src_offset_first_element_in_bytes,
2154 dst_ptr,
2155 dst_stride_x,
2156 dst_step_x,
2157 dst_stride_y,
2158 dst_step_y,
2159 dst_stride_z,
2160 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002161 dst_stride_w,
2162 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002163 dst_offset_first_element_in_bytes,
2164#if defined(HAS_BIAS)
2165 bias_ptr,
2166 bias_stride_x,
2167 bias_step_x,
2168 bias_offset_first_element_in_bytes,
2169#endif // defined(HAS_BIAS)
2170 dst_size);
2171}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01002172#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002173#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002174#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)