blob: e979978fa2d9e35289ee4e972c69175418d2798a [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Manuel Bottini0d0028c2018-10-02 16:41:52 +010026#if defined(FUSED_ACTIVATION)
27#include "activation_layer.cl"
28#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
29#else /* defined(FUSED_ACTIVATION) */
30#define ACTIVATION_FUNC(x) (x)
31#endif /* defined(FUSED_ACTIVATION) */
32
Georgios Pinitasffb57a02018-10-29 18:01:52 +000033#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010034#if defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010035/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
36 *
37 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
38 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
39 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
40 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
41 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010042 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Manuel Bottini0d0028c2018-10-02 16:41:52 +010043 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
44 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
45 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
46 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=int
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010047 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010048 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010049 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
50 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
51 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
52 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
53 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
54 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010055 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
56 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010057 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
58 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
59 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
60 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
61 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
62 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010063 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
64 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
65 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
66 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010067 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
68 */
69__kernel void winograd_output_transform_2x2_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +010070 TENSOR4D_DECLARATION(src),
71 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010072#if defined(HAS_BIAS)
73 ,
74 VECTOR_DECLARATION(bias)
75#endif // defined(HAS_BIAS)
76)
77{
78 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
Georgios Pinitasffb57a02018-10-29 18:01:52 +000079#if defined(SRC_DEPTH)
80 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +010081 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +000082#else /* defined(SRC_DEPTH) */
83 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
84 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
85#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010086
87 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010088 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
89 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
90 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
91 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010092
93#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
94 // Compute the 2x1 or 1x2 output tile
95 // out00 = d00 + d01 + d02
96 // out01 = d01 - d02 - d03
97
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +000098 float out00 = d00 + d01 + d02;
99 float out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100100#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100101
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100102 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
103 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
104 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
105 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100106
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100107 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
108 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
109 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
110 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100111
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100112 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
113 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
114 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
115 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100116
117 // Compute the 2x2 output tile
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000118 float k0 = d01 + d11 + d21;
119 float k1 = d02 + d12 + d22;
120 float k2 = d11 - d21 - d31;
121 float k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100122
123 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
124 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
125 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
126 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
127
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000128 float out00 = d10;
129 float out01 = -d13;
130 float out10 = d10;
131 float out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100132
133 out00 += d00 + d20 + k0 + k1;
134 out01 += k0 - k1 - (d03 + d23);
135 out10 += -d20 - d30 + k2 + k3;
136 out11 += k2 - k3 + d23 + d33;
137#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
138
139 int y_in = get_global_id(1);
140 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
141 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
142 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000143#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100144 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000145#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100146
147#if defined(HAS_BIAS)
148 // Add bias
149 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
150
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000151 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100152
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000153 out00 += (float)b;
154 out01 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100155#endif // defined(HAS_BIAS)
156
157 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000158#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100159 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000160#else /* defined(SRC_DEPTH) */
161 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
162#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100163
164 // Store the output tile
165#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100166 const const VEC_DATA_TYPE(DATA_TYPE, 2)
167 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
168 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
169 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100170#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100171 vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100172#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
173
174#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
175#if defined(HAS_BIAS)
176 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100177 out10 += (DATA_TYPE)b;
178 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100179#endif // defined(HAS_BIAS)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100180 vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100181#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
182}
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100183#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100184
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100185#if defined(VEC_SIZE) && VEC_SIZE == 4
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100186/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
187 *
188 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
189 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
190 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
191 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
192 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100193 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100194 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100195 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100196 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
197 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
198 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
199 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
200 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
201 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100202 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
203 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100204 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
205 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
206 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
207 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
208 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
209 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100210 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
211 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
212 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
213 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100214 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
215 */
216__kernel void winograd_output_transform_4x4_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100217 TENSOR4D_DECLARATION(src),
218 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100219#if defined(HAS_BIAS)
220 ,
221 VECTOR_DECLARATION(bias)
222#endif // defined(HAS_BIAS)
223)
224{
225 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000226#if defined(SRC_DEPTH)
227 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100228 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000229#else /* defined(SRC_DEPTH) */
230 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
231 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
232#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100233
234 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100235 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
236 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
237 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
238 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
239 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
240 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100241
242#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
243 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000244 float out00 = d00 + d01 + d02 + d03 + d04;
245 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
246 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
247 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100248#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100249
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100250 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
251 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
252 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
253 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
254 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
255 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100256
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100257 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
258 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
259 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
260 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
261 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
262 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100263
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100264 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
265 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
266 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
267 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
268 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
269 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100270
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100271 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
272 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
273 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
274 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
275 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
276 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100277
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100278 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
279 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
280 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
281 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
282 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
283 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100284
285 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000286 float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
287 float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
288 float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
289 float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100290
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000291 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
292 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100293
294 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
295 out01 += k1 - d02 - d12 - d22 - d32 - d42;
296 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
297 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
298
299 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000300 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
301 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
302 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
303 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100304
305 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
306 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
307
308 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
309 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
310 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
311 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
312
313 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000314 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
315 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
316 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
317 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100318
319 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
320 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
321
322 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
323 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
324 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
325 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
326
327 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000328 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
329 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
330 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
331 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100332
333 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
334 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
335
336 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
337 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
338 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
339 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
340#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
341
342 int y_in = get_global_id(1);
343 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
344 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
345 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000346#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100347 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000348#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100349
350#if defined(HAS_BIAS)
351 // Add bias
352 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
353
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000354 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100355
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000356 out00 += (float)b;
357 out01 += (float)b;
358 out02 += (float)b;
359 out03 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100360#endif // defined(HAS_BIAS)
361
362 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000363#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100364 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000365#else /* defined(SRC_DEPTH) */
366 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
367#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100368
369 // Store the output tile
370#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100371 VEC_DATA_TYPE(DATA_TYPE, 4)
372 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
373 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
374 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
375 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
376 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100377#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100378 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100379#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
380
381#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
382#if defined(HAS_BIAS)
383 // Add bias
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000384 out10 += (float)b;
385 out11 += (float)b;
386 out12 += (float)b;
387 out13 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100388
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000389 out20 += (float)b;
390 out21 += (float)b;
391 out22 += (float)b;
392 out23 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100393
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000394 out30 += (float)b;
395 out31 += (float)b;
396 out32 += (float)b;
397 out33 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100398#endif // defined(HAS_BIAS)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100399 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
400 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
401 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100402#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
403}
404
Giorgio Arena149fdf32018-07-04 17:03:33 +0100405/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100406 *
407 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100408 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
409 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
410 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
411 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100412 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100413 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100414 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100415 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
416 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
417 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
418 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
419 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
420 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100421 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
422 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100423 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
424 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
425 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
426 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
427 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
428 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100429 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
430 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
431 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
432 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100433 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
434 * @param[in] dst_size Size of the destination tensor, minus the last padding
435 */
436__kernel void winograd_output_transform_4x4_3x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100437 TENSOR4D_DECLARATION(src),
438 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100439#if defined(HAS_BIAS)
440 VECTOR_DECLARATION(bias),
441#endif // defined(HAS_BIAS)
442 int dst_size)
443{
Giorgio Arena149fdf32018-07-04 17:03:33 +0100444 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000445#if defined(SRC_DEPTH)
446 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100447 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000448#else /* defined(SRC_DEPTH) */
449 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
450 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
451#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100452
Giorgio Arena149fdf32018-07-04 17:03:33 +0100453 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100454 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
455 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
456 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
457 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
458 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
459 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100460
Giorgio Arena149fdf32018-07-04 17:03:33 +0100461#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
462 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000463 float out00 = d00 + d01 + d02 + d03 + d04;
464 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
465 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
466 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100467#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
468
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100469 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
470 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
471 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
472 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
473 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
474 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100475
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100476 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
477 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
478 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
479 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
480 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
481 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100482
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100483 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
484 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
485 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
486 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
487 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
488 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100489
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100490 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
491 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
492 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
493 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
494 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
495 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100496
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100497 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
498 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
499 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
500 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
501 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
502 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100503
504 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000505 float out00 = d01 + d21 + d41 + d11 + d31;
506 float out01 = d01 + d21 + d41 + d11 + d31;
507 float out02 = d01 + d21 + d41 + d11 + d31;
508 float out03 = d01 + d21 + d41 + d11 + d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100509
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000510 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
511 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100512
513 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
514 out01 += k1 - d02 - d12 - d22 - d32 - d42;
515 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
516 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
517
518 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000519 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
520 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
521 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
522 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100523
524 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
525 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
526
527 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
528 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
529 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
530 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
531
532 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000533 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
534 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
535 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
536 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100537
538 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
539 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
540
541 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
542 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
543 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
544 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
545
546 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000547 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
548 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
549 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
550 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100551
552 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
553 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
554
555 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
556 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
557 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
558 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100559#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100560
561 int y_in = get_global_id(1);
562 int x_out = get_global_id(0);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100563 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
564 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000565#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100566 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000567#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100568
569#if defined(HAS_BIAS)
570 // Add bias
571 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
572
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100573 DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100574
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100575 out00 += (DATA_TYPE)b;
576 out01 += (DATA_TYPE)b;
577 out02 += (DATA_TYPE)b;
578 out03 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100579#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100580 out10 += (DATA_TYPE)b;
581 out11 += (DATA_TYPE)b;
582 out12 += (DATA_TYPE)b;
583 out13 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100584
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100585 out20 += (DATA_TYPE)b;
586 out21 += (DATA_TYPE)b;
587 out22 += (DATA_TYPE)b;
588 out23 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100589
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100590 out30 += (DATA_TYPE)b;
591 out31 += (DATA_TYPE)b;
592 out32 += (DATA_TYPE)b;
593 out33 += (DATA_TYPE)b;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100594#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100595
596#endif // defined(HAS_BIAS)
597
Giorgio Arena149fdf32018-07-04 17:03:33 +0100598#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000599#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100600 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100601#else /* defined(SRC_DEPTH) */
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000602 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100603#endif /* defined(SRC_DEPTH) */
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000604 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Giorgio Arena149fdf32018-07-04 17:03:33 +0100605
606 // Store the 1x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100607 VEC_DATA_TYPE(DATA_TYPE, 4)
608 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
609 *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
610 *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
611 *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
612 *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100613#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
614 // Store the 4x1 output tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100615 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Giorgio Arenad02eb452018-07-18 11:45:30 +0100616 int mult_y = min(dst_size - offset, 1);
Giorgio Arena149fdf32018-07-04 17:03:33 +0100617
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100618 VEC_DATA_TYPE(DATA_TYPE, 4)
619 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
620 *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;
621 *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;
622 *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;
623 *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100624#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100625 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000626#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100627 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000628#else /* defined(SRC_DEPTH) */
629 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
630#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +0100631 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100632 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100633
634 // Store the 4x4 output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100635 VEC_DATA_TYPE(DATA_TYPE, 4)
636 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
637 VEC_DATA_TYPE(DATA_TYPE, 4)
638 out1_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)));
639 VEC_DATA_TYPE(DATA_TYPE, 4)
640 out2_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)));
641 VEC_DATA_TYPE(DATA_TYPE, 4)
642 out3_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)));
643 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
644 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
645 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;
646 *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;
647 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;
648 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;
649 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;
650 *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;
651 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;
652 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;
653 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;
654 *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;
655 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;
656 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;
657 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;
658 *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100659
660#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100661}
662
663#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
664 ({ \
665 comm_fact.s0 = d1 + d2; \
666 comm_fact.s1 = d3 + d4; \
667 comm_fact.s2 = d5 + d6; \
668 \
669 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
670 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
671 \
672 comm_fact.s0 = d1 - d2; \
673 comm_fact.s1 = d3 - d4; \
674 comm_fact.s2 = d5 - d6; \
675 \
676 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
677 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
678 })
679
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100680/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100681 *
682 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100683 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
684 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
685 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
686 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100687 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100688 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100689 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100690 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
691 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
692 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
693 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
694 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
695 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100696 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
697 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100698 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
699 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
700 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
701 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
702 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
703 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100704 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
705 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
706 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
707 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100708 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
709 */
710__kernel void winograd_output_transform_4x4_5x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100711 TENSOR4D_DECLARATION(src),
712 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100713#if defined(HAS_BIAS)
714 ,
715 VECTOR_DECLARATION(bias)
716#endif // defined(HAS_BIAS)
717)
718{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100719 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000720#if defined(SRC_DEPTH)
721 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100722 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000723#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100724
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000725 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
726 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
727#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100728
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100729 // Compute output address
730 int y_in = get_global_id(1);
731 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
732 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
733 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000734#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100735 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000736#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100737
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000738#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100739 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000740#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100741
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000742 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
743#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100744
745 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100746 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
747 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
748 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
749 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
750 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
751 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
752 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
753 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100754
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100755#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
756 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000757 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
758 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
759 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
760 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100761
762#if defined(HAS_BIAS)
763 // Add bias
764 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
765
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000766 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100767
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100768 out00 += (DATA_TYPE)b;
769 out01 += (DATA_TYPE)b;
770 out02 += (DATA_TYPE)b;
771 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100772#endif // defined(HAS_BIAS)
773
774 // Store the output tile
775#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100776 VEC_DATA_TYPE(DATA_TYPE, 4)
777 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
778 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
779 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
780 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
781 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100782#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100783 vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100784#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
785
786#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100787
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000788 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
789 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
790 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
791 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
792 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
793 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
794 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
795 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100796
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100797 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
798 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
799 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
800 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
801 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
802 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
803 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
804 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100805
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100806 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
807 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
808 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
809 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
810 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
811 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
812 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
813 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100814
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100815 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
816 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
817 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
818 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
819 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
820 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
821 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
822 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100823
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100824 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
825 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
826 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
827 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
828 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
829 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
830 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
831 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100832
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100833 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
834 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
835 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
836 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
837 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
838 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
839 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
840 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100841
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100842 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
843 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
844 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
845 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
846 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
847 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
848 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
849 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100850
851 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000852 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100853 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000854 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100855 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100856
857 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
858 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
859 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
860 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
861 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
862 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
863 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
864 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
865
866 // Compute the 4x4 output tile
867 comm_fact0 = tmp_col1 + tmp_col2;
868 comm_fact1 = tmp_col3 + tmp_col4;
869 comm_fact2 = tmp_col5 + tmp_col6;
870
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000871 VEC_DATA_TYPE(float, 4)
872 out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
873 VEC_DATA_TYPE(float, 4)
874 out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100875
876 comm_fact0 = tmp_col1 - tmp_col2;
877 comm_fact1 = tmp_col3 - tmp_col4;
878 comm_fact2 = tmp_col5 - tmp_col6;
879
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000880 VEC_DATA_TYPE(float, 4)
881 out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
882 VEC_DATA_TYPE(float, 4)
883 out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100884
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100885#if defined(HAS_BIAS)
886 // Add bias
887 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
888
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000889 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100890
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000891 out_col0 += (VEC_DATA_TYPE(float, 4))b;
892 out_col1 += (VEC_DATA_TYPE(float, 4))b;
893 out_col2 += (VEC_DATA_TYPE(float, 4))b;
894 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100895#endif // defined(HAS_BIAS)
896
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100897 // Store the output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100898 vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0)), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
899 vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1)), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
900 vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2)), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
901 vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3)), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100902#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100903}
904
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100905/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100906 *
907 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100908 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
909 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
910 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
911 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100912 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100913 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100914 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100915 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
916 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
917 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
918 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
919 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
920 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100921 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
922 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100923 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
924 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
925 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
926 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
927 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
928 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100929 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
930 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
931 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
932 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100933 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
934 */
935__kernel void winograd_output_transform_4x4_5x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100936 TENSOR4D_DECLARATION(src),
937 TENSOR4D_DECLARATION(dst),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100938#if defined(HAS_BIAS)
939 VECTOR_DECLARATION(bias),
940#endif // defined(HAS_BIAS)
941 int dst_size)
942{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100943 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000944#if defined(SRC_DEPTH)
945 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100946 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000947#else /* defined(SRC_DEPTH) */
948 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
949 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
950#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100951
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100952 int y_in = get_global_id(1);
953 int x_out = get_global_id(0);
954 int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
955 int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000956#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100957 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000958#endif /* defined(SRC_DEPTH) */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100959
960 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100961 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
962 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
963 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
964 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
965 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
966 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
967 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
968 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100969
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100970#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
971 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000972 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
973 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
974 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
975 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100976
977#if defined(HAS_BIAS)
978 // Add bias
979 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
980
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000981 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100982
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000983 out00 += (float)b;
984 out01 += (float)b;
985 out02 += (float)b;
986 out03 += (float)b;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100987#endif // defined(HAS_BIAS)
988
989 // Store the output tile
990#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
991 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000992#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100993 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000994#else /* defined(SRC_DEPTH) */
995 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
996#endif /* defined(SRC_DEPTH) */
997 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100998
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100999 VEC_DATA_TYPE(DATA_TYPE, 4)
1000 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
1001 *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
1002 *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
1003 *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
1004 *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001005#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1006 // Get output address
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001007 int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001008 VEC_DATA_TYPE(DATA_TYPE, 4)
1009 out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
1010 *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
1011 *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
1012 *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;
1013 *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001014#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1015
1016#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1017
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001018 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
1019 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
1020 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
1021 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
1022 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
1023 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
1024 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
1025 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001026
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001027 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
1028 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
1029 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
1030 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
1031 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
1032 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
1033 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
1034 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001035
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001036 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
1037 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
1038 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
1039 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
1040 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
1041 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
1042 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
1043 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001044
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001045 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
1046 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
1047 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
1048 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
1049 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
1050 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
1051 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
1052 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001053
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001054 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
1055 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
1056 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
1057 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
1058 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
1059 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
1060 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
1061 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001062
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001063 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
1064 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
1065 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
1066 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
1067 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
1068 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
1069 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
1070 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001071
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001072 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
1073 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
1074 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
1075 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
1076 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
1077 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
1078 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
1079 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001080
1081 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001082 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001083 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001084 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001085 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001086
1087 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
1088 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
1089 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
1090 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
1091 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
1092 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
1093 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
1094 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
1095
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001096 // Compute the output tile
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001097 comm_fact0 = tmp_col1 + tmp_col2;
1098 comm_fact1 = tmp_col3 + tmp_col4;
1099 comm_fact2 = tmp_col5 + tmp_col6;
1100
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001101 VEC_DATA_TYPE(float, 4)
1102 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
1103 VEC_DATA_TYPE(float, 4)
1104 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001105
1106 comm_fact0 = tmp_col1 - tmp_col2;
1107 comm_fact1 = tmp_col3 - tmp_col4;
1108 comm_fact2 = tmp_col5 - tmp_col6;
1109
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001110 VEC_DATA_TYPE(float, 4)
1111 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
1112 VEC_DATA_TYPE(float, 4)
1113 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001114
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001115#if defined(HAS_BIAS)
1116 // Add bias
1117 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
1118
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001119 DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001120
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00001121 out_col0 += (VEC_DATA_TYPE(float, 4))b;
1122 out_col1 += (VEC_DATA_TYPE(float, 4))b;
1123 out_col2 += (VEC_DATA_TYPE(float, 4))b;
1124 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001125#endif // defined(HAS_BIAS)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001126 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001127#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001128 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001129#else /* defined(SRC_DEPTH) */
1130 int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
1131#endif /* defined(SRC_DEPTH) */
Georgios Pinitasb0b37172018-07-20 17:30:56 +01001132 offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
1133 int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001134
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001135 // Store the output tile
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001136 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1137 out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
1138 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1139 out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
1140 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1141 out_col2_dt = ACTIVATION_FUNC(CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
1142 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1143 out_col3_dt = ACTIVATION_FUNC(CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
1144
1145 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
1146 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
1147 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;
1148 *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;
1149 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
1150 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
1151 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;
1152 *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;
1153 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;
1154 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;
1155 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;
1156 *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;
1157 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;
1158 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;
1159 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;
1160 *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001161#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001162}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001163#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001164
1165#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001166#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001167/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1168 *
1169 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1170 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1171 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1172 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001173 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001174 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001175 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001176 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1177 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1178 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1179 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1180 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1181 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001182 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1183 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001184 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1185 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1186 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1187 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1188 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1189 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001190 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1191 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1192 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1193 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001194 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1195 */
1196__kernel void winograd_output_transform_2x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001197 TENSOR4D_DECLARATION(src),
1198 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001199#if defined(HAS_BIAS)
1200 ,
1201 VECTOR_DECLARATION(bias)
1202#endif // defined(HAS_BIAS)
1203)
1204{
1205 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1206 src_stride_x,
1207 src_step_x,
1208 src_stride_y,
1209 src_step_y,
1210 src_stride_z,
1211 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001212 src_stride_w,
1213 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001214 src_offset_first_element_in_bytes,
1215 dst_ptr,
1216 dst_stride_x,
1217 dst_step_x,
1218 dst_stride_y,
1219 dst_step_y,
1220 dst_stride_z,
1221 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001222 dst_stride_w,
1223 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001224 dst_offset_first_element_in_bytes
1225#if defined(HAS_BIAS)
1226 ,
1227 bias_ptr,
1228 bias_stride_x,
1229 bias_step_x,
1230 bias_offset_first_element_in_bytes
1231#endif // defined(HAS_BIAS)
1232 );
1233}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001234#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001235
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001236#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001237/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1238 *
1239 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1240 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1241 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1242 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001243 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001244 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001245 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001246 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1247 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1248 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1249 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1250 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1251 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001252 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1253 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001254 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1255 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1256 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1257 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1258 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1259 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001260 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1261 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1262 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1263 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001264 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1265 */
1266__kernel void winograd_output_transform_4x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001267 TENSOR4D_DECLARATION(src),
1268 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001269#if defined(HAS_BIAS)
1270 ,
1271 VECTOR_DECLARATION(bias)
1272#endif // defined(HAS_BIAS)
1273)
1274{
1275 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1276 src_stride_x,
1277 src_step_x,
1278 src_stride_y,
1279 src_step_y,
1280 src_stride_z,
1281 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001282 src_stride_w,
1283 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001284 src_offset_first_element_in_bytes,
1285 dst_ptr,
1286 dst_stride_x,
1287 dst_step_x,
1288 dst_stride_y,
1289 dst_step_y,
1290 dst_stride_z,
1291 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001292 dst_stride_w,
1293 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001294 dst_offset_first_element_in_bytes
1295#if defined(HAS_BIAS)
1296 ,
1297 bias_ptr,
1298 bias_stride_x,
1299 bias_step_x,
1300 bias_offset_first_element_in_bytes
1301#endif // defined(HAS_BIAS)
1302 );
1303}
1304
1305/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1306 *
1307 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1308 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1309 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1310 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001311 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001312 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001313 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001314 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1315 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1316 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1317 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1318 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1319 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001320 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1321 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001322 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1323 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1324 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1325 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1326 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1327 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001328 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1329 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1330 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1331 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001332 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1333 */
1334__kernel void winograd_output_transform_4x1_5x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001335 TENSOR4D_DECLARATION(src),
1336 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001337#if defined(HAS_BIAS)
1338 ,
1339 VECTOR_DECLARATION(bias)
1340#endif // defined(HAS_BIAS)
1341)
1342{
1343 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1344 src_stride_x,
1345 src_step_x,
1346 src_stride_y,
1347 src_step_y,
1348 src_stride_z,
1349 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001350 src_stride_w,
1351 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001352 src_offset_first_element_in_bytes,
1353 dst_ptr,
1354 dst_stride_x,
1355 dst_step_x,
1356 dst_stride_y,
1357 dst_step_y,
1358 dst_stride_z,
1359 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001360 dst_stride_w,
1361 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001362 dst_offset_first_element_in_bytes
1363#if defined(HAS_BIAS)
1364 ,
1365 bias_ptr,
1366 bias_stride_x,
1367 bias_step_x,
1368 bias_offset_first_element_in_bytes
1369#endif // defined(HAS_BIAS)
1370 );
1371}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001372
1373/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1374 *
1375 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1376 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1377 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1378 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001379 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001380 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001381 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001382 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1383 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1384 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1385 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1386 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1387 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001388 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1389 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001390 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1391 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1392 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1393 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1394 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1395 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001396 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1397 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1398 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1399 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001400 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1401 */
1402__kernel void winograd_output_transform_4x1_3x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001403 TENSOR4D_DECLARATION(src),
1404 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001405#if defined(HAS_BIAS)
1406 VECTOR_DECLARATION(bias),
1407#endif // defined(HAS_BIAS)
1408 int dst_size)
1409{
1410 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1411 src_stride_x,
1412 src_step_x,
1413 src_stride_y,
1414 src_step_y,
1415 src_stride_z,
1416 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001417 src_stride_w,
1418 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001419 src_offset_first_element_in_bytes,
1420 dst_ptr,
1421 dst_stride_x,
1422 dst_step_x,
1423 dst_stride_y,
1424 dst_step_y,
1425 dst_stride_z,
1426 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001427 dst_stride_w,
1428 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001429 dst_offset_first_element_in_bytes,
1430#if defined(HAS_BIAS)
1431 bias_ptr,
1432 bias_stride_x,
1433 bias_step_x,
1434 bias_offset_first_element_in_bytes,
1435#endif // defined(HAS_BIAS)
1436 dst_size);
1437}
1438
1439/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1440 *
1441 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1442 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1443 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1444 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001445 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001446 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001447 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001448 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1449 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1450 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1451 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1452 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1453 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001454 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1455 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001456 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1457 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1458 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1459 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1460 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1461 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001462 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1463 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1464 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1465 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001466 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1467 */
1468__kernel void winograd_output_transform_4x1_5x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001469 TENSOR4D_DECLARATION(src),
1470 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001471#if defined(HAS_BIAS)
1472 VECTOR_DECLARATION(bias),
1473#endif // defined(HAS_BIAS)
1474 int dst_size)
1475{
1476 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1477 src_stride_x,
1478 src_step_x,
1479 src_stride_y,
1480 src_step_y,
1481 src_stride_z,
1482 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001483 src_stride_w,
1484 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001485 src_offset_first_element_in_bytes,
1486 dst_ptr,
1487 dst_stride_x,
1488 dst_step_x,
1489 dst_stride_y,
1490 dst_step_y,
1491 dst_stride_z,
1492 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001493 dst_stride_w,
1494 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001495 dst_offset_first_element_in_bytes,
1496#if defined(HAS_BIAS)
1497 bias_ptr,
1498 bias_stride_x,
1499 bias_step_x,
1500 bias_offset_first_element_in_bytes,
1501#endif // defined(HAS_BIAS)
1502 dst_size);
1503}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001504#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001505#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1506
1507#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001508#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001509/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1510 *
1511 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1512 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1513 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1514 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001515 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001516 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001517 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001518 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1519 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1520 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1521 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1522 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1523 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001524 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1525 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001526 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1527 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1528 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1529 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1530 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1531 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001532 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1533 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1534 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1535 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001536 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1537 */
1538__kernel void winograd_output_transform_1x2_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001539 TENSOR4D_DECLARATION(src),
1540 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001541#if defined(HAS_BIAS)
1542 ,
1543 VECTOR_DECLARATION(bias)
1544#endif // defined(HAS_BIAS)
1545)
1546{
1547 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1548 src_stride_x,
1549 src_step_x,
1550 src_stride_y,
1551 src_step_y,
1552 src_stride_z,
1553 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001554 src_stride_w,
1555 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001556 src_offset_first_element_in_bytes,
1557 dst_ptr,
1558 dst_stride_x,
1559 dst_step_x,
1560 dst_stride_y,
1561 dst_step_y,
1562 dst_stride_z,
1563 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001564 dst_stride_w,
1565 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001566 dst_offset_first_element_in_bytes
1567#if defined(HAS_BIAS)
1568 ,
1569 bias_ptr,
1570 bias_stride_x,
1571 bias_step_x,
1572 bias_offset_first_element_in_bytes
1573#endif // defined(HAS_BIAS)
1574 );
1575}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001576#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001577
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001578#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001579/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1580 *
1581 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1582 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1583 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1584 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001585 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001586 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001587 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001588 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1589 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1590 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1591 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1592 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1593 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001594 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1595 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001596 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1597 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1598 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1599 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1600 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1601 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001602 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1603 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1604 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1605 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001606 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1607 */
1608__kernel void winograd_output_transform_1x4_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001609 TENSOR4D_DECLARATION(src),
1610 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001611#if defined(HAS_BIAS)
1612 ,
1613 VECTOR_DECLARATION(bias)
1614#endif // defined(HAS_BIAS)
1615)
1616{
1617 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1618 src_stride_x,
1619 src_step_x,
1620 src_stride_y,
1621 src_step_y,
1622 src_stride_z,
1623 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001624 src_stride_w,
1625 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001626 src_offset_first_element_in_bytes,
1627 dst_ptr,
1628 dst_stride_x,
1629 dst_step_x,
1630 dst_stride_y,
1631 dst_step_y,
1632 dst_stride_z,
1633 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001634 dst_stride_w,
1635 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001636 dst_offset_first_element_in_bytes
1637#if defined(HAS_BIAS)
1638 ,
1639 bias_ptr,
1640 bias_stride_x,
1641 bias_step_x,
1642 bias_offset_first_element_in_bytes
1643#endif // defined(HAS_BIAS)
1644 );
1645}
1646
1647/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
1648 *
1649 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1650 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1651 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1652 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001653 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001654 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001655 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001656 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1657 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1658 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1659 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1660 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1661 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001662 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1663 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001664 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1665 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1666 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1667 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1668 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1669 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001670 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1671 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1672 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1673 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001674 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1675 */
1676__kernel void winograd_output_transform_1x4_1x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001677 TENSOR4D_DECLARATION(src),
1678 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001679#if defined(HAS_BIAS)
1680 ,
1681 VECTOR_DECLARATION(bias)
1682#endif // defined(HAS_BIAS)
1683)
1684{
1685 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1686 src_stride_x,
1687 src_step_x,
1688 src_stride_y,
1689 src_step_y,
1690 src_stride_z,
1691 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001692 src_stride_w,
1693 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001694 src_offset_first_element_in_bytes,
1695 dst_ptr,
1696 dst_stride_x,
1697 dst_step_x,
1698 dst_stride_y,
1699 dst_step_y,
1700 dst_stride_z,
1701 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001702 dst_stride_w,
1703 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001704 dst_offset_first_element_in_bytes
1705#if defined(HAS_BIAS)
1706 ,
1707 bias_ptr,
1708 bias_stride_x,
1709 bias_step_x,
1710 bias_offset_first_element_in_bytes
1711#endif // defined(HAS_BIAS)
1712 );
1713}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001714
1715/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
1716 *
1717 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1718 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1719 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1720 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001721 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001722 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001723 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001724 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1725 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1726 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1727 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1728 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1729 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001730 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1731 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001732 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1733 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1734 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1735 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1736 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1737 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001738 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1739 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1740 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1741 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001742 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1743 */
1744__kernel void winograd_output_transform_1x4_1x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001745 TENSOR4D_DECLARATION(src),
1746 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001747#if defined(HAS_BIAS)
1748 VECTOR_DECLARATION(bias),
1749#endif // defined(HAS_BIAS)
1750 int dst_size)
1751{
1752 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1753 src_stride_x,
1754 src_step_x,
1755 src_stride_y,
1756 src_step_y,
1757 src_stride_z,
1758 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001759 src_stride_w,
1760 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001761 src_offset_first_element_in_bytes,
1762 dst_ptr,
1763 dst_stride_x,
1764 dst_step_x,
1765 dst_stride_y,
1766 dst_step_y,
1767 dst_stride_z,
1768 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001769 dst_stride_w,
1770 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001771 dst_offset_first_element_in_bytes,
1772#if defined(HAS_BIAS)
1773 bias_ptr,
1774 bias_stride_x,
1775 bias_step_x,
1776 bias_offset_first_element_in_bytes,
1777#endif // defined(HAS_BIAS)
1778 dst_size);
1779}
1780
1781/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
1782 *
1783 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1784 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1785 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1786 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001787 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001788 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001789 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001790 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1791 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1792 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1793 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1794 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1795 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001796 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1797 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001798 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1799 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1800 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1801 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1802 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1803 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001804 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1805 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1806 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1807 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001808 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1809 */
1810__kernel void winograd_output_transform_1x4_1x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001811 TENSOR4D_DECLARATION(src),
1812 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001813#if defined(HAS_BIAS)
1814 VECTOR_DECLARATION(bias),
1815#endif // defined(HAS_BIAS)
1816 int dst_size)
1817{
1818 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1819 src_stride_x,
1820 src_step_x,
1821 src_stride_y,
1822 src_step_y,
1823 src_stride_z,
1824 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001825 src_stride_w,
1826 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001827 src_offset_first_element_in_bytes,
1828 dst_ptr,
1829 dst_stride_x,
1830 dst_step_x,
1831 dst_stride_y,
1832 dst_step_y,
1833 dst_stride_z,
1834 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001835 dst_stride_w,
1836 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001837 dst_offset_first_element_in_bytes,
1838#if defined(HAS_BIAS)
1839 bias_ptr,
1840 bias_stride_x,
1841 bias_step_x,
1842 bias_offset_first_element_in_bytes,
1843#endif // defined(HAS_BIAS)
1844 dst_size);
1845}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001846#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001847#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001848#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)