blob: 2c7c05fdd16387b0e1191e71bcc0b1f87fb0d95c [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Georgios Pinitasffb57a02018-10-29 18:01:52 +000026#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010027/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
28 *
29 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
30 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
31 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
32 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
33 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010034 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010035 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010036 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010037 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
38 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
39 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
40 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
41 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
42 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010043 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
44 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010045 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
46 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
47 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
48 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
49 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
50 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010051 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
52 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
53 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
54 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010055 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
56 */
57__kernel void winograd_output_transform_2x2_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +010058 TENSOR4D_DECLARATION(src),
59 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010060#if defined(HAS_BIAS)
61 ,
62 VECTOR_DECLARATION(bias)
63#endif // defined(HAS_BIAS)
64)
65{
66 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
Georgios Pinitasffb57a02018-10-29 18:01:52 +000067#if defined(SRC_DEPTH)
68 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +010069 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +000070#else /* defined(SRC_DEPTH) */
71 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
72 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
73#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010074
75 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010076 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
77 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
78 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
79 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010080
81#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
82 // Compute the 2x1 or 1x2 output tile
83 // out00 = d00 + d01 + d02
84 // out01 = d01 - d02 - d03
85
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010086 DATA_TYPE out00 = d00 + d01 + d02;
87 DATA_TYPE out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010088#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010089 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
90 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
91 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
92 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010093
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010094 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
95 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
96 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
97 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010098
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010099 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
100 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
101 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
102 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100103
104 // Compute the 2x2 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100105 DATA_TYPE k0 = d01 + d11 + d21;
106 DATA_TYPE k1 = d02 + d12 + d22;
107 DATA_TYPE k2 = d11 - d21 - d31;
108 DATA_TYPE k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100109
110 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
111 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
112 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
113 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
114
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100115 DATA_TYPE out00 = d10;
116 DATA_TYPE out01 = -d13;
117 DATA_TYPE out10 = d10;
118 DATA_TYPE out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100119
120 out00 += d00 + d20 + k0 + k1;
121 out01 += k0 - k1 - (d03 + d23);
122 out10 += -d20 - d30 + k2 + k3;
123 out11 += k2 - k3 + d23 + d33;
124#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
125
126 int y_in = get_global_id(1);
127 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
128 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
129 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000130#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100131 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000132#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100133
134#if defined(HAS_BIAS)
135 // Add bias
136 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
137
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100138 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100139
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100140 out00 += (DATA_TYPE)b;
141 out01 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100142#endif // defined(HAS_BIAS)
143
144 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000145#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100146 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000147#else /* defined(SRC_DEPTH) */
148 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
149#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100150
151 // Store the output tile
152#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100153 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
154 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100155#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100156 vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out00, out01), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100157#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
158
159#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
160#if defined(HAS_BIAS)
161 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100162 out10 += (DATA_TYPE)b;
163 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100164#endif // defined(HAS_BIAS)
165
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100166 vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out10, out11), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100167#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
168}
169
170/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
171 *
172 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
173 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
174 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
175 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
176 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100177 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100178 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100179 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100180 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
181 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
182 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
183 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
184 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
185 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100186 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
187 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100188 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
189 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
190 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
191 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
192 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
193 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100194 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
195 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
196 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
197 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100198 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
199 */
200__kernel void winograd_output_transform_4x4_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100201 TENSOR4D_DECLARATION(src),
202 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100203#if defined(HAS_BIAS)
204 ,
205 VECTOR_DECLARATION(bias)
206#endif // defined(HAS_BIAS)
207)
208{
209 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000210#if defined(SRC_DEPTH)
211 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100212 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000213#else /* defined(SRC_DEPTH) */
214 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
215 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
216#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100217
218 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100219 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
220 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
221 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
222 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
223 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
224 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100225
226#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
227 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100228 DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04;
229 DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
230 DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
231 DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100232#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100233 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
234 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
235 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
236 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
237 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
238 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100239
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100240 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
241 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
242 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
243 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
244 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
245 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100246
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100247 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
248 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
249 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
250 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
251 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
252 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100253
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100254 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
255 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
256 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
257 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
258 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
259 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100260
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100261 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
262 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
263 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
264 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
265 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
266 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100267
268 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100269 DATA_TYPE out00 = d01 + d21 + d41 + d11 + d31;
270 DATA_TYPE out01 = d01 + d21 + d41 + d11 + d31;
271 DATA_TYPE out02 = d01 + d21 + d41 + d11 + d31;
272 DATA_TYPE out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100273
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100274 DATA_TYPE k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
275 DATA_TYPE k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100276
277 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
278 out01 += k1 - d02 - d12 - d22 - d32 - d42;
279 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
280 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
281
282 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100283 DATA_TYPE out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
284 DATA_TYPE out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
285 DATA_TYPE out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
286 DATA_TYPE out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100287
288 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
289 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
290
291 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
292 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
293 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
294 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
295
296 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100297 DATA_TYPE out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
298 DATA_TYPE out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
299 DATA_TYPE out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
300 DATA_TYPE out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100301
302 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
303 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
304
305 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
306 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
307 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
308 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
309
310 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100311 DATA_TYPE out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
312 DATA_TYPE out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
313 DATA_TYPE out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
314 DATA_TYPE out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100315
316 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
317 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
318
319 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
320 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
321 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
322 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
323#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
324
325 int y_in = get_global_id(1);
326 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
327 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
328 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000329#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100330 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000331#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100332
333#if defined(HAS_BIAS)
334 // Add bias
335 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
336
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100337 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100338
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100339 out00 += (DATA_TYPE)b;
340 out01 += (DATA_TYPE)b;
341 out02 += (DATA_TYPE)b;
342 out03 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100343#endif // defined(HAS_BIAS)
344
345 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000346#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100347 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000348#else /* defined(SRC_DEPTH) */
349 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
350#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100351
352 // Store the output tile
353#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100354 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
355 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
356 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
357 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100358#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100359 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100360#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
361
362#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
363#if defined(HAS_BIAS)
364 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100365 out10 += (DATA_TYPE)b;
366 out11 += (DATA_TYPE)b;
367 out12 += (DATA_TYPE)b;
368 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100369
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100370 out20 += (DATA_TYPE)b;
371 out21 += (DATA_TYPE)b;
372 out22 += (DATA_TYPE)b;
373 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100374
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100375 out30 += (DATA_TYPE)b;
376 out31 += (DATA_TYPE)b;
377 out32 += (DATA_TYPE)b;
378 out33 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100379#endif // defined(HAS_BIAS)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100380 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out10, out11, out12, out13), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
381 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out20, out21, out22, out23), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
382 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out30, out31, out32, out33), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100383#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
384}
385
Giorgio Arena149fdf32018-07-04 17:03:33 +0100386/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100387 *
388 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100389 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
390 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
391 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
392 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100393 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100394 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100395 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100396 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
397 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
398 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
399 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
400 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
401 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100402 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
403 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100404 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
405 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
406 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
407 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
408 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
409 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100410 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
411 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
412 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
413 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100414 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
415 * @param[in] dst_size Size of the destination tensor, minus the last padding
416 */
417__kernel void winograd_output_transform_4x4_3x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100418 TENSOR4D_DECLARATION(src),
419 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100420#if defined(HAS_BIAS)
421 VECTOR_DECLARATION(bias),
422#endif // defined(HAS_BIAS)
423 int dst_size)
424{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100425 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000426#if defined(SRC_DEPTH)
427 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100428 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000429#else /* defined(SRC_DEPTH) */
430 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
431 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
432#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100433
Giorgio Arena149fdf32018-07-04 17:03:33 +0100434 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100435 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
436 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
437 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
438 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
439 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
440 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100441
Giorgio Arena149fdf32018-07-04 17:03:33 +0100442#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
443 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100444 DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04;
445 DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
446 DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
447 DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100448#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
449
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100450 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
451 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
452 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
453 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
454 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
455 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100456
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100457 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
458 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
459 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
460 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
461 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
462 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100463
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100464 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
465 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
466 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
467 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
468 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
469 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100470
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100471 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
472 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
473 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
474 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
475 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
476 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100477
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100478 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
479 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
480 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
481 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
482 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
483 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100484
485 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100486 DATA_TYPE out00 = d01 + d21 + d41 + d11 + d31;
487 DATA_TYPE out01 = d01 + d21 + d41 + d11 + d31;
488 DATA_TYPE out02 = d01 + d21 + d41 + d11 + d31;
489 DATA_TYPE out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100490
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100491 DATA_TYPE k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
492 DATA_TYPE k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100493
494 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
495 out01 += k1 - d02 - d12 - d22 - d32 - d42;
496 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
497 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
498
499 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100500 DATA_TYPE out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
501 DATA_TYPE out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
502 DATA_TYPE out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
503 DATA_TYPE out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100504
505 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
506 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
507
508 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
509 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
510 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
511 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
512
513 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100514 DATA_TYPE out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
515 DATA_TYPE out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
516 DATA_TYPE out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
517 DATA_TYPE out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100518
519 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
520 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
521
522 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
523 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
524 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
525 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
526
527 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100528 DATA_TYPE out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
529 DATA_TYPE out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
530 DATA_TYPE out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
531 DATA_TYPE out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100532
533 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
534 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
535
536 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
537 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
538 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
539 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100540#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100541
542 int y_in = get_global_id(1);
543 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100544 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
545 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000546#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100547 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000548#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100549
550#if defined(HAS_BIAS)
551 // Add bias
552 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
553
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100554 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100555
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100556 out00 += (DATA_TYPE)b;
557 out01 += (DATA_TYPE)b;
558 out02 += (DATA_TYPE)b;
559 out03 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100560#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100561 out10 += (DATA_TYPE)b;
562 out11 += (DATA_TYPE)b;
563 out12 += (DATA_TYPE)b;
564 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100565
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100566 out20 += (DATA_TYPE)b;
567 out21 += (DATA_TYPE)b;
568 out22 += (DATA_TYPE)b;
569 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100570
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100571 out30 += (DATA_TYPE)b;
572 out31 += (DATA_TYPE)b;
573 out32 += (DATA_TYPE)b;
574 out33 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100575#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100576
577#endif // defined(HAS_BIAS)
578
Giorgio Arena149fdf32018-07-04 17:03:33 +0100579#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000580#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100581 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000582#else /* defined(SRC_DEPTH) */
583 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
584#endif /* defined(SRC_DEPTH) */
585 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Giorgio Arena149fdf32018-07-04 17:03:33 +0100586
587 // Store the 1x4 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100588 *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out00;
589 *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out01;
590 *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out02;
591 *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out03;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100592#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
593 // Store the 4x1 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100594 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenad02eb452018-07-18 11:45:30 +0100595 int mult_y = min(dst_size - offset, 1);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100596
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100597 *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out00;
598 *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out01;
599 *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out02;
600 *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100601#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100602 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000603#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100604 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000605#else /* defined(SRC_DEPTH) */
606 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
607#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100608 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100609 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100610
611 // Store the 4x4 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100612 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;
613 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;
614 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;
615 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;
616 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;
617 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;
618 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;
619 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;
620 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;
621 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;
622 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;
623 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;
624 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;
625 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;
626 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;
627 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100628
629#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100630}
631
632#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
633 ({ \
634 comm_fact.s0 = d1 + d2; \
635 comm_fact.s1 = d3 + d4; \
636 comm_fact.s2 = d5 + d6; \
637 \
638 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
639 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
640 \
641 comm_fact.s0 = d1 - d2; \
642 comm_fact.s1 = d3 - d4; \
643 comm_fact.s2 = d5 - d6; \
644 \
645 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
646 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
647 })
648
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100649/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100650 *
651 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100652 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
653 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
654 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
655 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100656 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100657 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100658 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100659 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
660 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
661 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
662 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
663 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
664 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100665 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
666 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100667 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
668 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
669 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
670 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
671 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
672 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100673 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
674 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
675 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
676 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100677 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
678 */
679__kernel void winograd_output_transform_4x4_5x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100680 TENSOR4D_DECLARATION(src),
681 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100682#if defined(HAS_BIAS)
683 ,
684 VECTOR_DECLARATION(bias)
685#endif // defined(HAS_BIAS)
686)
687{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100688 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000689#if defined(SRC_DEPTH)
690 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100691 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000692#else /* defined(SRC_DEPTH) */
693 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
694 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
695#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100696
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100697 // Compute output address
698 int y_in = get_global_id(1);
699 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
700 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
701 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000702#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100703 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000704#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100705
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000706#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100707 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000708#else /* defined(SRC_DEPTH) */
709 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
710#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100711
712 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100713 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
714 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
715 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
716 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
717 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
718 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
719 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
720 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100721
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100722#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
723 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100724 DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
725 DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
726 DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
727 DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100728
729#if defined(HAS_BIAS)
730 // Add bias
731 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
732
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100733 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100734
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100735 out00 += (DATA_TYPE)b;
736 out01 += (DATA_TYPE)b;
737 out02 += (DATA_TYPE)b;
738 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100739#endif // defined(HAS_BIAS)
740
741 // Store the output tile
742#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100743 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
744 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
745 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
746 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100747#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100748 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100749#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
750
751#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000752 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
753 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
754 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
755 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
756 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
757 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
758 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
759 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100760
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100761 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
762 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
763 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
764 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
765 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
766 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
767 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
768 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100769
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100770 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
771 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
772 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
773 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
774 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
775 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
776 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
777 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100778
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100779 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
780 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
781 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
782 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
783 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
784 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
785 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
786 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100787
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100788 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
789 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
790 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
791 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
792 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
793 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
794 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
795 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100796
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100797 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
798 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
799 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
800 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
801 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
802 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
803 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
804 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100805
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100806 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
807 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
808 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
809 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
810 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
811 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
812 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
813 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100814
815 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100816 VEC_DATA_TYPE(DATA_TYPE, 4)
817 comm_fact0, comm_fact1, comm_fact2;
818 VEC_DATA_TYPE(DATA_TYPE, 4)
819 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100820
821 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
822 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
823 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
824 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
825 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
826 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
827 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
828 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
829
830 // Compute the 4x4 output tile
831 comm_fact0 = tmp_col1 + tmp_col2;
832 comm_fact1 = tmp_col3 + tmp_col4;
833 comm_fact2 = tmp_col5 + tmp_col6;
834
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100835 VEC_DATA_TYPE(DATA_TYPE, 4)
836 out_col0 = comm_fact0 + comm_fact1 + (DATA_TYPE)8.f * comm_fact2 + tmp_col0;
837 VEC_DATA_TYPE(DATA_TYPE, 4)
838 out_col2 = comm_fact0 + (DATA_TYPE)4.f * comm_fact1 + (DATA_TYPE)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100839
840 comm_fact0 = tmp_col1 - tmp_col2;
841 comm_fact1 = tmp_col3 - tmp_col4;
842 comm_fact2 = tmp_col5 - tmp_col6;
843
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100844 VEC_DATA_TYPE(DATA_TYPE, 4)
845 out_col1 = comm_fact0 + (DATA_TYPE)2.f * comm_fact1 + (DATA_TYPE)4.f * comm_fact2;
846 VEC_DATA_TYPE(DATA_TYPE, 4)
847 out_col3 = comm_fact0 + (DATA_TYPE)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100848
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100849#if defined(HAS_BIAS)
850 // Add bias
851 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
852
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100853 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100854
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100855 out_col0 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
856 out_col1 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
857 out_col2 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
858 out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100859#endif // defined(HAS_BIAS)
860
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100861 // Store the output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100862 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
863 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
864 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
865 vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100866#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100867}
868
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100869/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100870 *
871 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100872 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
873 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
874 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
875 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100876 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100877 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100878 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100879 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
880 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
881 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
882 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
883 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
884 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100885 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
886 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100887 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
888 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
889 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
890 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
891 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
892 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100893 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
894 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
895 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
896 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100897 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
898 */
899__kernel void winograd_output_transform_4x4_5x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100900 TENSOR4D_DECLARATION(src),
901 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100902#if defined(HAS_BIAS)
903 VECTOR_DECLARATION(bias),
904#endif // defined(HAS_BIAS)
905 int dst_size)
906{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100907 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000908#if defined(SRC_DEPTH)
909 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100910 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000911#else /* defined(SRC_DEPTH) */
912 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
913 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
914#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100915
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100916 int y_in = get_global_id(1);
917 int x_out = get_global_id(0);
918 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
919 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000920#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100921 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000922#endif /* defined(SRC_DEPTH) */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100923
924 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100925 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
926 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
927 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
928 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
929 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
930 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
931 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
932 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100933
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100934#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
935 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100936 DATA_TYPE out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
937 DATA_TYPE out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
938 DATA_TYPE out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
939 DATA_TYPE out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100940
941#if defined(HAS_BIAS)
942 // Add bias
943 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
944
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100945 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100946
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100947 out00 += (DATA_TYPE)b;
948 out01 += (DATA_TYPE)b;
949 out02 += (DATA_TYPE)b;
950 out03 += (DATA_TYPE)b;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100951#endif // defined(HAS_BIAS)
952
953 // Store the output tile
954#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
955 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000956#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100957 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000958#else /* defined(SRC_DEPTH) */
959 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
960#endif /* defined(SRC_DEPTH) */
961 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100962
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100963 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out00;
964 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out01;
965 *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out02;
966 *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out03;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100967#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
968 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100969 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100970
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100971 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out00;
972 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out01;
973 *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out02;
974 *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out03;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100975#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
976
977#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
978
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100979 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
980 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
981 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
982 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
983 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
984 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
985 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
986 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100987
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100988 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
989 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
990 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
991 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
992 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
993 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
994 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
995 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100996
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100997 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
998 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
999 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1000 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1001 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1002 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1003 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1004 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001005
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001006 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1007 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1008 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1009 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1010 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1011 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1012 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1013 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001014
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001015 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1016 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1017 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1018 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1019 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1020 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1021 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1022 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001023
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001024 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1025 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1026 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1027 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1028 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1029 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1030 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1031 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001032
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001033 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1034 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1035 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1036 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1037 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1038 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1039 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1040 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001041
1042 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001043 VEC_DATA_TYPE(DATA_TYPE, 4)
1044 comm_fact0, comm_fact1, comm_fact2;
1045 VEC_DATA_TYPE(DATA_TYPE, 4)
1046 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001047
1048 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1049 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1050 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1051 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1052 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1053 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1054 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1055 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1056
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001057 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001058 comm_fact0 = tmp_col1 + tmp_col2;
1059 comm_fact1 = tmp_col3 + tmp_col4;
1060 comm_fact2 = tmp_col5 + tmp_col6;
1061
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001062 VEC_DATA_TYPE(DATA_TYPE, 4)
1063 out_col0 = comm_fact0 + comm_fact1 + (DATA_TYPE)8.f * comm_fact2 + tmp_col0;
1064 VEC_DATA_TYPE(DATA_TYPE, 4)
1065 out_col2 = comm_fact0 + (DATA_TYPE)4.f * comm_fact1 + (DATA_TYPE)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001066
1067 comm_fact0 = tmp_col1 - tmp_col2;
1068 comm_fact1 = tmp_col3 - tmp_col4;
1069 comm_fact2 = tmp_col5 - tmp_col6;
1070
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001071 VEC_DATA_TYPE(DATA_TYPE, 4)
1072 out_col1 = comm_fact0 + (DATA_TYPE)2.f * comm_fact1 + (DATA_TYPE)4.f * comm_fact2;
1073 VEC_DATA_TYPE(DATA_TYPE, 4)
1074 out_col3 = comm_fact0 + (DATA_TYPE)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001075
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001076#if defined(HAS_BIAS)
1077 // Add bias
1078 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1079
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001080 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001081
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001082 out_col0 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
1083 out_col1 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
1084 out_col2 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
1085 out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001086#endif // defined(HAS_BIAS)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001087 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001088#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001089 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001090#else /* defined(SRC_DEPTH) */
1091 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1092#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +01001093 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
1094 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001095
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001096 // Store the output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001097 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0.s0;
1098 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1.s0;
1099 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2.s0;
1100 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3.s0;
1101 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0.s1;
1102 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1.s1;
1103 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2.s1;
1104 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3.s1;
1105 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0.s2;
1106 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1.s2;
1107 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2.s2;
1108 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3.s2;
1109 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0.s3;
1110 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1.s3;
1111 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2.s3;
1112 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001113#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001114}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001115
1116#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1117/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1118 *
1119 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1120 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1121 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1122 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001123 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001124 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001125 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001126 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1127 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1128 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1129 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1130 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1131 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001132 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1133 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001134 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1135 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1136 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1137 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1138 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1139 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001140 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1141 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1142 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1143 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001144 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1145 */
1146__kernel void winograd_output_transform_2x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001147 TENSOR4D_DECLARATION(src),
1148 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001149#if defined(HAS_BIAS)
1150 ,
1151 VECTOR_DECLARATION(bias)
1152#endif // defined(HAS_BIAS)
1153)
1154{
1155 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1156 src_stride_x,
1157 src_step_x,
1158 src_stride_y,
1159 src_step_y,
1160 src_stride_z,
1161 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001162 src_stride_w,
1163 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001164 src_offset_first_element_in_bytes,
1165 dst_ptr,
1166 dst_stride_x,
1167 dst_step_x,
1168 dst_stride_y,
1169 dst_step_y,
1170 dst_stride_z,
1171 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001172 dst_stride_w,
1173 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001174 dst_offset_first_element_in_bytes
1175#if defined(HAS_BIAS)
1176 ,
1177 bias_ptr,
1178 bias_stride_x,
1179 bias_step_x,
1180 bias_offset_first_element_in_bytes
1181#endif // defined(HAS_BIAS)
1182 );
1183}
1184
1185/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1186 *
1187 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1188 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1189 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1190 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001191 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001192 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001193 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001194 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1195 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1196 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1197 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1198 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1199 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001200 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1201 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001202 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1203 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1204 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1205 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1206 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1207 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001208 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1209 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1210 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1211 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001212 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1213 */
1214__kernel void winograd_output_transform_4x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001215 TENSOR4D_DECLARATION(src),
1216 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001217#if defined(HAS_BIAS)
1218 ,
1219 VECTOR_DECLARATION(bias)
1220#endif // defined(HAS_BIAS)
1221)
1222{
1223 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1224 src_stride_x,
1225 src_step_x,
1226 src_stride_y,
1227 src_step_y,
1228 src_stride_z,
1229 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001230 src_stride_w,
1231 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001232 src_offset_first_element_in_bytes,
1233 dst_ptr,
1234 dst_stride_x,
1235 dst_step_x,
1236 dst_stride_y,
1237 dst_step_y,
1238 dst_stride_z,
1239 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001240 dst_stride_w,
1241 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001242 dst_offset_first_element_in_bytes
1243#if defined(HAS_BIAS)
1244 ,
1245 bias_ptr,
1246 bias_stride_x,
1247 bias_step_x,
1248 bias_offset_first_element_in_bytes
1249#endif // defined(HAS_BIAS)
1250 );
1251}
1252
1253/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1254 *
1255 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1256 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1257 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1258 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001259 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001260 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001261 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001262 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1263 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1264 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1265 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1266 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1267 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001268 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1269 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001270 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1271 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1272 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1273 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1274 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1275 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001276 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1277 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1278 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1279 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001280 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1281 */
1282__kernel void winograd_output_transform_4x1_5x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001283 TENSOR4D_DECLARATION(src),
1284 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001285#if defined(HAS_BIAS)
1286 ,
1287 VECTOR_DECLARATION(bias)
1288#endif // defined(HAS_BIAS)
1289)
1290{
1291 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1292 src_stride_x,
1293 src_step_x,
1294 src_stride_y,
1295 src_step_y,
1296 src_stride_z,
1297 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001298 src_stride_w,
1299 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001300 src_offset_first_element_in_bytes,
1301 dst_ptr,
1302 dst_stride_x,
1303 dst_step_x,
1304 dst_stride_y,
1305 dst_step_y,
1306 dst_stride_z,
1307 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001308 dst_stride_w,
1309 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001310 dst_offset_first_element_in_bytes
1311#if defined(HAS_BIAS)
1312 ,
1313 bias_ptr,
1314 bias_stride_x,
1315 bias_step_x,
1316 bias_offset_first_element_in_bytes
1317#endif // defined(HAS_BIAS)
1318 );
1319}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001320
1321/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1322 *
1323 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1324 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1325 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1326 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001327 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001328 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001329 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001330 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1331 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1332 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1333 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1334 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1335 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001336 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1337 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001338 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1339 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1340 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1341 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1342 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1343 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001344 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1345 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1346 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1347 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001348 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1349 */
1350__kernel void winograd_output_transform_4x1_3x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001351 TENSOR4D_DECLARATION(src),
1352 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001353#if defined(HAS_BIAS)
1354 VECTOR_DECLARATION(bias),
1355#endif // defined(HAS_BIAS)
1356 int dst_size)
1357{
1358 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1359 src_stride_x,
1360 src_step_x,
1361 src_stride_y,
1362 src_step_y,
1363 src_stride_z,
1364 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001365 src_stride_w,
1366 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001367 src_offset_first_element_in_bytes,
1368 dst_ptr,
1369 dst_stride_x,
1370 dst_step_x,
1371 dst_stride_y,
1372 dst_step_y,
1373 dst_stride_z,
1374 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001375 dst_stride_w,
1376 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001377 dst_offset_first_element_in_bytes,
1378#if defined(HAS_BIAS)
1379 bias_ptr,
1380 bias_stride_x,
1381 bias_step_x,
1382 bias_offset_first_element_in_bytes,
1383#endif // defined(HAS_BIAS)
1384 dst_size);
1385}
1386
1387/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1388 *
1389 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1390 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1391 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1392 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001393 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001394 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001395 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001396 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1397 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1398 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1399 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1400 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1401 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001402 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1403 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001404 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1405 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1406 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1407 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1408 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1409 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001410 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1411 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1412 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1413 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001414 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1415 */
1416__kernel void winograd_output_transform_4x1_5x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001417 TENSOR4D_DECLARATION(src),
1418 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001419#if defined(HAS_BIAS)
1420 VECTOR_DECLARATION(bias),
1421#endif // defined(HAS_BIAS)
1422 int dst_size)
1423{
1424 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1425 src_stride_x,
1426 src_step_x,
1427 src_stride_y,
1428 src_step_y,
1429 src_stride_z,
1430 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001431 src_stride_w,
1432 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001433 src_offset_first_element_in_bytes,
1434 dst_ptr,
1435 dst_stride_x,
1436 dst_step_x,
1437 dst_stride_y,
1438 dst_step_y,
1439 dst_stride_z,
1440 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001441 dst_stride_w,
1442 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001443 dst_offset_first_element_in_bytes,
1444#if defined(HAS_BIAS)
1445 bias_ptr,
1446 bias_stride_x,
1447 bias_step_x,
1448 bias_offset_first_element_in_bytes,
1449#endif // defined(HAS_BIAS)
1450 dst_size);
1451}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001452#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1453
1454#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1455/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1456 *
1457 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1458 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1459 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1460 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001461 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001462 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001463 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001464 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1465 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1466 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1467 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1468 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1469 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001470 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1471 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001472 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1473 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1474 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1475 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1476 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1477 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001478 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1479 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1480 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1481 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001482 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1483 */
1484__kernel void winograd_output_transform_1x2_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001485 TENSOR4D_DECLARATION(src),
1486 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001487#if defined(HAS_BIAS)
1488 ,
1489 VECTOR_DECLARATION(bias)
1490#endif // defined(HAS_BIAS)
1491)
1492{
1493 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1494 src_stride_x,
1495 src_step_x,
1496 src_stride_y,
1497 src_step_y,
1498 src_stride_z,
1499 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001500 src_stride_w,
1501 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001502 src_offset_first_element_in_bytes,
1503 dst_ptr,
1504 dst_stride_x,
1505 dst_step_x,
1506 dst_stride_y,
1507 dst_step_y,
1508 dst_stride_z,
1509 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001510 dst_stride_w,
1511 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001512 dst_offset_first_element_in_bytes
1513#if defined(HAS_BIAS)
1514 ,
1515 bias_ptr,
1516 bias_stride_x,
1517 bias_step_x,
1518 bias_offset_first_element_in_bytes
1519#endif // defined(HAS_BIAS)
1520 );
1521}
1522
1523/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1524 *
1525 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1526 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1527 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1528 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001529 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001530 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001531 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001532 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1533 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1534 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1535 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1536 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1537 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001538 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1539 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001540 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1541 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1542 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1543 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1544 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1545 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001546 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1547 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1548 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1549 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001550 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1551 */
1552__kernel void winograd_output_transform_1x4_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001553 TENSOR4D_DECLARATION(src),
1554 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001555#if defined(HAS_BIAS)
1556 ,
1557 VECTOR_DECLARATION(bias)
1558#endif // defined(HAS_BIAS)
1559)
1560{
1561 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1562 src_stride_x,
1563 src_step_x,
1564 src_stride_y,
1565 src_step_y,
1566 src_stride_z,
1567 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001568 src_stride_w,
1569 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001570 src_offset_first_element_in_bytes,
1571 dst_ptr,
1572 dst_stride_x,
1573 dst_step_x,
1574 dst_stride_y,
1575 dst_step_y,
1576 dst_stride_z,
1577 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001578 dst_stride_w,
1579 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001580 dst_offset_first_element_in_bytes
1581#if defined(HAS_BIAS)
1582 ,
1583 bias_ptr,
1584 bias_stride_x,
1585 bias_step_x,
1586 bias_offset_first_element_in_bytes
1587#endif // defined(HAS_BIAS)
1588 );
1589}
1590
1591/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
1592 *
1593 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1594 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1595 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1596 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001597 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001598 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001599 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001600 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1601 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1602 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1603 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1604 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1605 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001606 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1607 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001608 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1609 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1610 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1611 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1612 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1613 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001614 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1615 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1616 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1617 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001618 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1619 */
1620__kernel void winograd_output_transform_1x4_1x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001621 TENSOR4D_DECLARATION(src),
1622 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001623#if defined(HAS_BIAS)
1624 ,
1625 VECTOR_DECLARATION(bias)
1626#endif // defined(HAS_BIAS)
1627)
1628{
1629 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1630 src_stride_x,
1631 src_step_x,
1632 src_stride_y,
1633 src_step_y,
1634 src_stride_z,
1635 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001636 src_stride_w,
1637 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001638 src_offset_first_element_in_bytes,
1639 dst_ptr,
1640 dst_stride_x,
1641 dst_step_x,
1642 dst_stride_y,
1643 dst_step_y,
1644 dst_stride_z,
1645 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001646 dst_stride_w,
1647 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001648 dst_offset_first_element_in_bytes
1649#if defined(HAS_BIAS)
1650 ,
1651 bias_ptr,
1652 bias_stride_x,
1653 bias_step_x,
1654 bias_offset_first_element_in_bytes
1655#endif // defined(HAS_BIAS)
1656 );
1657}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001658
1659/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
1660 *
1661 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1662 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1663 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1664 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001665 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001666 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001667 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001668 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1669 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1670 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1671 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1672 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1673 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001674 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1675 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001676 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1677 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1678 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1679 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1680 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1681 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001682 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1683 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1684 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1685 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001686 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1687 */
1688__kernel void winograd_output_transform_1x4_1x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001689 TENSOR4D_DECLARATION(src),
1690 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001691#if defined(HAS_BIAS)
1692 VECTOR_DECLARATION(bias),
1693#endif // defined(HAS_BIAS)
1694 int dst_size)
1695{
1696 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1697 src_stride_x,
1698 src_step_x,
1699 src_stride_y,
1700 src_step_y,
1701 src_stride_z,
1702 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001703 src_stride_w,
1704 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001705 src_offset_first_element_in_bytes,
1706 dst_ptr,
1707 dst_stride_x,
1708 dst_step_x,
1709 dst_stride_y,
1710 dst_step_y,
1711 dst_stride_z,
1712 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001713 dst_stride_w,
1714 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001715 dst_offset_first_element_in_bytes,
1716#if defined(HAS_BIAS)
1717 bias_ptr,
1718 bias_stride_x,
1719 bias_step_x,
1720 bias_offset_first_element_in_bytes,
1721#endif // defined(HAS_BIAS)
1722 dst_size);
1723}
1724
1725/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
1726 *
1727 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1728 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1729 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1730 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001731 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001732 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001733 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001734 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1735 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1736 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1737 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1738 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1739 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001740 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1741 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001742 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1743 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1744 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1745 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1746 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1747 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001748 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1749 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1750 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1751 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001752 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1753 */
1754__kernel void winograd_output_transform_1x4_1x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001755 TENSOR4D_DECLARATION(src),
1756 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001757#if defined(HAS_BIAS)
1758 VECTOR_DECLARATION(bias),
1759#endif // defined(HAS_BIAS)
1760 int dst_size)
1761{
1762 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1763 src_stride_x,
1764 src_step_x,
1765 src_stride_y,
1766 src_step_y,
1767 src_stride_z,
1768 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001769 src_stride_w,
1770 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001771 src_offset_first_element_in_bytes,
1772 dst_ptr,
1773 dst_stride_x,
1774 dst_step_x,
1775 dst_stride_y,
1776 dst_step_y,
1777 dst_stride_z,
1778 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001779 dst_stride_w,
1780 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001781 dst_offset_first_element_in_bytes,
1782#if defined(HAS_BIAS)
1783 bias_ptr,
1784 bias_stride_x,
1785 bias_step_x,
1786 bias_offset_first_element_in_bytes,
1787#endif // defined(HAS_BIAS)
1788 dst_size);
1789}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001790#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001791#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)