blob: 6bd90604e5e49b32e7586a79c878d5be5850e51f [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
Gian Marco Iodicea8903c82021-03-24 14:48:22 +00002 * Copyright (c) 2018-2021 Arm Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Usama Arif6a98a6e2019-05-10 17:07:27 +010024#include "activation_float_helpers.h"
Gian Marco Iodicea8903c82021-03-24 14:48:22 +000025#include "helpers.h"
26#include "tile_helpers.h"
Manuel Bottini0d0028c2018-10-02 16:41:52 +010027
Georgios Pinitasffb57a02018-10-29 18:01:52 +000028#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010029#if defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010030/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
31 *
32 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
33 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
34 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
35 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
36 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010037 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Usama Arif6a98a6e2019-05-10 17:07:27 +010038 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottini0d0028c2018-10-02 16:41:52 +010039 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
40 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010041 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010042 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010043 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
44 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
45 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
46 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
47 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
48 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010049 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
50 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010051 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
52 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
53 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
54 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
55 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
56 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +010057 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
58 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
59 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
60 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010061 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
62 */
63__kernel void winograd_output_transform_2x2_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +010064 TENSOR4D_DECLARATION(src),
65 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010066#if defined(HAS_BIAS)
67 ,
68 VECTOR_DECLARATION(bias)
69#endif // defined(HAS_BIAS)
70)
71{
72 // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
Georgios Pinitasffb57a02018-10-29 18:01:52 +000073#if defined(SRC_DEPTH)
74 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +010075 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +000076#else /* defined(SRC_DEPTH) */
77 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
78 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
79#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010080
81 // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010082 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
83 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
84 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
85 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010086
87#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
88 // Compute the 2x1 or 1x2 output tile
89 // out00 = d00 + d01 + d02
90 // out01 = d01 - d02 - d03
91
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +000092 float out00 = d00 + d01 + d02;
93 float out01 = d01 - d02 - d03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010094#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +010095
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010096 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
97 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
98 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
99 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100100
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100101 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
102 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
103 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
104 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100105
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100106 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
107 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
108 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
109 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100110
111 // Compute the 2x2 output tile
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000112 float k0 = d01 + d11 + d21;
113 float k1 = d02 + d12 + d22;
114 float k2 = d11 - d21 - d31;
115 float k3 = d12 - d22 - d32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100116
117 // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
118 // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
119 // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
120 // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
121
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000122 float out00 = d10;
123 float out01 = -d13;
124 float out10 = d10;
125 float out11 = -d13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100126
127 out00 += d00 + d20 + k0 + k1;
128 out01 += k0 - k1 - (d03 + d23);
129 out10 += -d20 - d30 + k2 + k3;
130 out11 += k2 - k3 + d23 + d33;
131#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
132
133 int y_in = get_global_id(1);
134 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
135 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
136 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000137#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100138 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000139#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100140
141#if defined(HAS_BIAS)
142 // Add bias
143 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
144
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000145 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100146
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000147 out00 += (float)b;
148 out01 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100149#endif // defined(HAS_BIAS)
150
151 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000152#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100153 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000154#else /* defined(SRC_DEPTH) */
155 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
156#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100157
158 // Store the output tile
159#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Michalis Spyrouef6ec502020-07-31 11:38:36 +0100160 const VEC_DATA_TYPE(DATA_TYPE, 2)
Giorgio Arenad056e572020-10-12 11:53:51 +0100161 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100162 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
163 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100164#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +0100165 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100166 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100167#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
168
169#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
170#if defined(HAS_BIAS)
171 // Add bias
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100172 out10 += (DATA_TYPE)b;
173 out11 += (DATA_TYPE)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100174#endif // defined(HAS_BIAS)
Giorgio Arenad056e572020-10-12 11:53:51 +0100175 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100176 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100177#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
178}
giuros013bfacb22019-04-01 12:07:02 +0100179
giuros013bfacb22019-04-01 12:07:02 +0100180/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
181 *
182 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
183 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
184 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
Giorgio Arena83eee192021-04-08 16:41:51 +0100185 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100186 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
187 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
giuros013bfacb22019-04-01 12:07:02 +0100188 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
189 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
190 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arena83eee192021-04-08 16:41:51 +0100191 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
giuros013bfacb22019-04-01 12:07:02 +0100192 *
193 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
194 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
195 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
196 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
197 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
198 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
199 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
200 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
201 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
202 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
203 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
204 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
205 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
206 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
207 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
208 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
209 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
210 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
211 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
212 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
213 */
214__kernel void winograd_output_transform_2x2_7x7_nhwc(
Giorgio Arena83eee192021-04-08 16:41:51 +0100215 TENSOR4D(src, BUFFER),
216 TENSOR4D(dst, BUFFER),
giuros013bfacb22019-04-01 12:07:02 +0100217#if defined(HAS_BIAS)
218 VECTOR_DECLARATION(bias),
219#endif // defined(HAS_BIAS)
220 int dst_size)
221{
Giorgio Arena83eee192021-04-08 16:41:51 +0100222#define _ISRC_HEIGHT SRC_HEIGHT
223#define _IDST_WIDTH DST_WIDTH
224#define _IDST_HEIGHT DST_HEIGHT
225#define _INUM_TILES_X NUM_TILES_X
giuros013bfacb22019-04-01 12:07:02 +0100226
Giorgio Arena83eee192021-04-08 16:41:51 +0100227 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
228 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
229 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
giuros013bfacb22019-04-01 12:07:02 +0100230
Giorgio Arena83eee192021-04-08 16:41:51 +0100231 int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
232 int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
giuros013bfacb22019-04-01 12:07:02 +0100233
234#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena83eee192021-04-08 16:41:51 +0100235 TILE(DATA_TYPE, 8, N0, in);
236 TILE(DATA_TYPE, 2, N0, out);
237 TILE(uint, 8, 1, src_indirect_y);
238
239 // Calculate the indirect Y for the source tensor
240 LOOP_UNROLLING(int, i, 0, 8, 1)
241 {
242 src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
243 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
244 }
245
246 // Load the values across the 8 channels to compose the 8x1 tile
247 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
248
249 // Compute out0 and out01
250 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
251 out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
giuros013bfacb22019-04-01 12:07:02 +0100252
253#if defined(HAS_BIAS)
254 // Add bias
Giorgio Arena83eee192021-04-08 16:41:51 +0100255 TILE(DATA_TYPE, 1, N0, b);
giuros013bfacb22019-04-01 12:07:02 +0100256
Giorgio Arena83eee192021-04-08 16:41:51 +0100257 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b);
giuros013bfacb22019-04-01 12:07:02 +0100258
Giorgio Arena83eee192021-04-08 16:41:51 +0100259 T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
giuros013bfacb22019-04-01 12:07:02 +0100260#endif // defined(HAS_BIAS)
261
Giorgio Arena83eee192021-04-08 16:41:51 +0100262 T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
263
264 TILE(uint, 2, 1, dst_indirect_y);
265
giuros013bfacb22019-04-01 12:07:02 +0100266#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena83eee192021-04-08 16:41:51 +0100267 LOOP_UNROLLING(int, yk, 0, 2, 1)
268 {
269 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
270 dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
271 }
giuros013bfacb22019-04-01 12:07:02 +0100272#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena83eee192021-04-08 16:41:51 +0100273 LOOP_UNROLLING(int, xk, 0, 2, 1)
274 {
275 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
276 dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
277 }
giuros013bfacb22019-04-01 12:07:02 +0100278#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
279
Giorgio Arena83eee192021-04-08 16:41:51 +0100280 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
281 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
282
giuros013bfacb22019-04-01 12:07:02 +0100283#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
284
Giorgio Arena83eee192021-04-08 16:41:51 +0100285 TILE(DATA_TYPE, 64, N0, in);
286 TILE(DATA_TYPE, 4, N0, out);
287 TILE(DATA_TYPE, 16, N0, tmp);
288 TILE(uint, 64, 1, src_indirect_y);
giuros013bfacb22019-04-01 12:07:02 +0100289
Giorgio Arena83eee192021-04-08 16:41:51 +0100290 // Calculate the indirect Y for the source tensor
291 LOOP_UNROLLING(int, i, 0, 64, 1)
292 {
293 src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
294 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
295 }
giuros013bfacb22019-04-01 12:07:02 +0100296
Giorgio Arena83eee192021-04-08 16:41:51 +0100297 // Load the values across the 64 channels to compose the 8x8 tile
298 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
giuros013bfacb22019-04-01 12:07:02 +0100299
Giorgio Arena83eee192021-04-08 16:41:51 +0100300 LOOP_UNROLLING(int, i, 0, 8, 1)
301 {
302 tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
303 tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
304 }
giuros013bfacb22019-04-01 12:07:02 +0100305
306 // Compute the 2x2 output tile
Giorgio Arena83eee192021-04-08 16:41:51 +0100307 LOOP_UNROLLING(int, i, 0, 2, 1)
308 {
309 out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
310 out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
311 }
giuros013bfacb22019-04-01 12:07:02 +0100312
313#if defined(HAS_BIAS)
314 // Add bias
Giorgio Arena83eee192021-04-08 16:41:51 +0100315 TILE(DATA_TYPE, 1, N0, b);
giuros013bfacb22019-04-01 12:07:02 +0100316
Giorgio Arena83eee192021-04-08 16:41:51 +0100317 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b);
giuros013bfacb22019-04-01 12:07:02 +0100318
Giorgio Arena83eee192021-04-08 16:41:51 +0100319 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
giuros013bfacb22019-04-01 12:07:02 +0100320#endif // defined(HAS_BIAS)
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100321
Giorgio Arena83eee192021-04-08 16:41:51 +0100322 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
giuros013bfacb22019-04-01 12:07:02 +0100323
Giorgio Arena83eee192021-04-08 16:41:51 +0100324 TILE(uint, 4, 1, dst_indirect_y);
giuros013bfacb22019-04-01 12:07:02 +0100325
Giorgio Arena83eee192021-04-08 16:41:51 +0100326 // Calculate the destination indirect Y
327 LOOP_UNROLLING(int, yk, 0, 2, 1)
328 {
329 LOOP_UNROLLING(int, xk, 0, 2, 1)
330 {
331 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
332 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
333 dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;
334 dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
335 }
336 }
giuros013bfacb22019-04-01 12:07:02 +0100337
Giorgio Arena83eee192021-04-08 16:41:51 +0100338 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
339 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
giuros013bfacb22019-04-01 12:07:02 +0100340#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
341}
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100342#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100343
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100344#if defined(VEC_SIZE) && VEC_SIZE == 4
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100345/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
346 *
347 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
348 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
349 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
350 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
351 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100352 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100353 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100354 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100355 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
356 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
357 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
358 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
359 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
360 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100361 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
362 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100363 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
364 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
365 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
366 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
367 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
368 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100369 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
370 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
371 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
372 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100373 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
374 */
375__kernel void winograd_output_transform_4x4_3x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100376 TENSOR4D_DECLARATION(src),
377 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100378#if defined(HAS_BIAS)
379 ,
380 VECTOR_DECLARATION(bias)
381#endif // defined(HAS_BIAS)
382)
383{
384 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000385#if defined(SRC_DEPTH)
386 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100387 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000388#else /* defined(SRC_DEPTH) */
Giorgio Arena83eee192021-04-08 16:41:51 +0100389 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
390 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000391#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100392
393 // Load the values across the channels to compose the 6x6 or 6x1 tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100394 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
395 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
396 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
397 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
398 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
399 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100400
401#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
402 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000403 float out00 = d00 + d01 + d02 + d03 + d04;
404 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
405 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
406 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100407#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100408
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100409 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
410 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
411 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
412 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
413 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
414 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100415
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100416 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
417 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
418 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
419 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
420 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
421 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100422
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100423 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
424 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
425 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
426 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
427 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
428 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100429
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100430 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
431 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
432 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
433 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
434 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
435 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100436
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100437 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
438 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
439 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
440 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
441 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
442 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100443
444 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000445 float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
446 float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
447 float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
448 float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100449
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000450 float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
451 float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100452
453 out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
454 out01 += k1 - d02 - d12 - d22 - d32 - d42;
455 out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
456 out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
457
458 // Compute out10, out11, out12 and out13
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000459 float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
460 float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
461 float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
462 float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100463
464 k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
465 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
466
467 out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
468 out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
469 out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
470 out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
471
472 // Compute out20, out21, out22 and out23
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000473 float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
474 float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
475 float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
476 float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100477
478 k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
479 k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
480
481 out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
482 out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
483 out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
484 out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
485
486 // Compute out30, out31, out32 and out33
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000487 float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
488 float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
489 float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
490 float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100491
492 k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
493 k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
494
495 out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
496 out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
497 out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
498 out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
499#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
500
501 int y_in = get_global_id(1);
502 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
503 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
504 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000505#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100506 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000507#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100508
509#if defined(HAS_BIAS)
510 // Add bias
511 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
512
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000513 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100514
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000515 out00 += (float)b;
516 out01 += (float)b;
517 out02 += (float)b;
518 out03 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100519#endif // defined(HAS_BIAS)
520
521 // Get output address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000522#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100523 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000524#else /* defined(SRC_DEPTH) */
525 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
526#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100527
528 // Store the output tile
529#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100530 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100531 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
532 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100533 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
534 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
535 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
536 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100537#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +0100538 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100539 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100540#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
541
542#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
543#if defined(HAS_BIAS)
544 // Add bias
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000545 out10 += (float)b;
546 out11 += (float)b;
547 out12 += (float)b;
548 out13 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100549
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000550 out20 += (float)b;
551 out21 += (float)b;
552 out22 += (float)b;
553 out23 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100554
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000555 out30 += (float)b;
556 out31 += (float)b;
557 out32 += (float)b;
558 out33 += (float)b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100559#endif // defined(HAS_BIAS)
Giorgio Arenad056e572020-10-12 11:53:51 +0100560 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100561 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100562 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100563 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100564 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100565 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100566#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
567}
568
Giorgio Arena149fdf32018-07-04 17:03:33 +0100569/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100570 *
571 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Giorgio Arena149fdf32018-07-04 17:03:33 +0100572 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
573 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000574 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100575 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
576 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Giorgio Arena149fdf32018-07-04 17:03:33 +0100577 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
578 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100579 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000580 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100581 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100582 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100583 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
584 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
585 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
586 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
587 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
588 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100589 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
590 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100591 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
592 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
593 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
594 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
595 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
596 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100597 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
598 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
599 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
600 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100601 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
602 * @param[in] dst_size Size of the destination tensor, minus the last padding
603 */
604__kernel void winograd_output_transform_4x4_3x3_nhwc(
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000605 TENSOR4D(src, BUFFER),
606 TENSOR4D(dst, BUFFER),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100607#if defined(HAS_BIAS)
608 VECTOR_DECLARATION(bias),
609#endif // defined(HAS_BIAS)
610 int dst_size)
611{
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000612 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
613 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
614 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100615
Giorgio Arena149fdf32018-07-04 17:03:33 +0100616#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100617
Giorgio Arena83eee192021-04-08 16:41:51 +0100618 TILE(DATA_TYPE, 6, N0, in) = { { { 0 } } };
619 TILE(DATA_TYPE, 4, N0, out) = { { { 0 } } };
620 TILE(uint, 6, 1, src_indirect_y) = { { { 0 } } };
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100621
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000622 LOOP_UNROLLING(int, i, 0, 6, 1)
623 {
624 src_indirect_y[i].v = mout + i * SRC_HEIGHT;
625 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
626 }
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100627
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000628 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
629 T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100630
631 // Compute out00, out01, out02 and out03
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000632 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
633 out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
634 out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
635 out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100636
637#if defined(HAS_BIAS)
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000638 TILE(DATA_TYPE, 1, N0, b);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100639
Gian Marco Iodice0b76f7d2021-04-08 17:20:00 +0100640 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100641
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000642 // c = c + bias[broadcasted]
643 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
644#endif // HAS_BIAS
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100645
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000646 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
647 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100648
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000649 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100650
Giorgio Arena83eee192021-04-08 16:41:51 +0100651 TILE(uint, 4, 1, dst_indirect_y) = { { { 0 } } };
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100652
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000653 // Calculate the destination indirect Y
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100654#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000655 LOOP_UNROLLING(int, yk, 0, 4, 1)
656 {
657 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
658 dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
659 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
660 }
Giorgio Arena83eee192021-04-08 16:41:51 +0100661#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000662 LOOP_UNROLLING(int, xk, 0, 4, 1)
663 {
664 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
665 dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
666 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
667 }
668#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100669
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000670 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
671 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100672
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000673#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100674
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000675 // Calculate the indirect Y for the source tensor
Giorgio Arena83eee192021-04-08 16:41:51 +0100676 TILE(DATA_TYPE, 36, N0, in) = { { { 0 } } };
677 TILE(DATA_TYPE, 4, N0, tmp) = { { { 0 } } };
678 TILE(uint, 36, 1, src_indirect_y) = { { { 0 } } };
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100679
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000680 LOOP_UNROLLING(int, i, 0, 36, 1)
681 {
682 src_indirect_y[i].v = mout + i * SRC_HEIGHT;
683 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
684 }
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100685
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000686 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
687 T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100688
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000689 LOOP_UNROLLING(int, i, 0, 6, 1)
690 {
691 tmp[0].v = in[6 + i].v + in[12 + i].v;
692 tmp[1].v = in[6 + i].v - in[12 + i].v;
693 tmp[2].v = in[18 + i].v + in[24 + i].v;
694 tmp[3].v = in[18 + i].v - in[24 + i].v;
695 tmp[3].v = tmp[3].v + tmp[3].v;
696 in[i].v = in[i].v + tmp[0].v + tmp[2].v;
697 in[6 + i].v = tmp[3].v + tmp[1].v;
698 in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
699 in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
700 }
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100701
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000702 // Compute the output tile
Giorgio Arena83eee192021-04-08 16:41:51 +0100703 TILE(DATA_TYPE, 16, N0, out) = { { { 0 } } };
Giorgio Arena149fdf32018-07-04 17:03:33 +0100704
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000705 LOOP_UNROLLING(int, i, 0, 4, 1)
706 {
707 tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
708 tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
709 tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
710 tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
711 tmp[3].v = tmp[3].v + tmp[3].v;
712 out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
713 out[4 * i + 1].v = tmp[3].v + tmp[1].v;
714 out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
715 out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
716 }
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100717
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000718#if defined(HAS_BIAS)
719 TILE(DATA_TYPE, 1, N0, b);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100720
Gian Marco Iodice0b76f7d2021-04-08 17:20:00 +0100721 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
Gian Marco Iodice5f910412020-10-20 09:14:45 +0100722
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000723 // c = c + bias[broadcasted]
724 T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
725#endif // HAS_BIAS
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100726
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000727 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
728 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
Giorgio Arena149fdf32018-07-04 17:03:33 +0100729
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000730 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
731
Giorgio Arena83eee192021-04-08 16:41:51 +0100732 TILE(uint, 16, 1, dst_indirect_y) = { { { 0 } } };
Gian Marco Iodicea8903c82021-03-24 14:48:22 +0000733
734 // Calculate the destination indirect Y
735 LOOP_UNROLLING(int, yk, 0, 4, 1)
736 {
737 LOOP_UNROLLING(int, xk, 0, 4, 1)
738 {
739 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
740 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
741 dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
742 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
743 }
744 }
745
746 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
747 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
748#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100749}
750
751#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
752 ({ \
753 comm_fact.s0 = d1 + d2; \
754 comm_fact.s1 = d3 + d4; \
755 comm_fact.s2 = d5 + d6; \
756 \
757 col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
758 col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
759 \
760 comm_fact.s0 = d1 - d2; \
761 comm_fact.s1 = d3 - d4; \
762 comm_fact.s2 = d5 - d6; \
763 \
764 col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
765 col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
766 })
767
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100768/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100769 *
770 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100771 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
772 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
773 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
774 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100775 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100776 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100777 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100778 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
779 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
780 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
781 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
782 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
783 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100784 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
785 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100786 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
787 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
788 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
789 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
790 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
791 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100792 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
793 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
794 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
795 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100796 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
797 */
798__kernel void winograd_output_transform_4x4_5x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100799 TENSOR4D_DECLARATION(src),
800 TENSOR4D_DECLARATION(dst)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100801#if defined(HAS_BIAS)
802 ,
803 VECTOR_DECLARATION(bias)
804#endif // defined(HAS_BIAS)
805)
806{
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100807 // Each thread stores a 4x4/4x1 or 1x4 tile
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000808#if defined(SRC_DEPTH)
809 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100810 const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000811#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100812
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000813 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
814 const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
815#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100816
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100817 // Compute output address
818 int y_in = get_global_id(1);
819 int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
820 int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
821 int z_out = get_global_id(0);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000822#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100823 int batch = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000824#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100825
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000826#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100827 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000828#else /* defined(SRC_DEPTH) */
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100829
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000830 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
831#endif /* defined(SRC_DEPTH) */
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100832
833 // Load the values across the channels to compose the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100834 DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
835 DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
836 DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
837 DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
838 DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
839 DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
840 DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
841 DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100842
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100843#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
844 // Compute out00, out01, out02 and out03
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000845 float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
846 float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
847 float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
848 float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100849
850#if defined(HAS_BIAS)
851 // Add bias
852 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
853
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000854 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100855
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100856 out00 += (DATA_TYPE)b;
857 out01 += (DATA_TYPE)b;
858 out02 += (DATA_TYPE)b;
859 out03 += (DATA_TYPE)b;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100860#endif // defined(HAS_BIAS)
861
862 // Store the output tile
863#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100864 VEC_DATA_TYPE(DATA_TYPE, 4)
Giorgio Arenad056e572020-10-12 11:53:51 +0100865 out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
866 B_VAL);
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100867 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
868 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
869 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
870 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100871#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenad056e572020-10-12 11:53:51 +0100872 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100873 (__global DATA_TYPE *)(dst_addr));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100874#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
875
876#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100877
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000878 DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
879 DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
880 DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
881 DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
882 DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
883 DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
884 DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
885 DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100886
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100887 DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
888 DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
889 DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
890 DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
891 DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
892 DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
893 DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
894 DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100895
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100896 DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
897 DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
898 DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
899 DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
900 DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
901 DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
902 DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
903 DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100904
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100905 DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
906 DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
907 DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
908 DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
909 DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
910 DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
911 DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
912 DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100913
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100914 DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
915 DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
916 DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
917 DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
918 DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
919 DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
920 DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
921 DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100922
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100923 DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
924 DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
925 DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
926 DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
927 DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
928 DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
929 DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
930 DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100931
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100932 DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
933 DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
934 DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
935 DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
936 DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
937 DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
938 DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
939 DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100940
941 // Compute the 8x4 intermediate tensor
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000942 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100943 comm_fact0, comm_fact1, comm_fact2;
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000944 VEC_DATA_TYPE(float, 4)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100945 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100946
947 COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
948 COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
949 COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
950 COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
951 COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
952 COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
953 COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
954 COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
955
956 // Compute the 4x4 output tile
957 comm_fact0 = tmp_col1 + tmp_col2;
958 comm_fact1 = tmp_col3 + tmp_col4;
959 comm_fact2 = tmp_col5 + tmp_col6;
960
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000961 VEC_DATA_TYPE(float, 4)
962 out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
963 VEC_DATA_TYPE(float, 4)
964 out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100965
966 comm_fact0 = tmp_col1 - tmp_col2;
967 comm_fact1 = tmp_col3 - tmp_col4;
968 comm_fact2 = tmp_col5 - tmp_col6;
969
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000970 VEC_DATA_TYPE(float, 4)
971 out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
972 VEC_DATA_TYPE(float, 4)
973 out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100974
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100975#if defined(HAS_BIAS)
976 // Add bias
977 Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
978
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000979 float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100980
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +0000981 out_col0 += (VEC_DATA_TYPE(float, 4))b;
982 out_col1 += (VEC_DATA_TYPE(float, 4))b;
983 out_col2 += (VEC_DATA_TYPE(float, 4))b;
984 out_col3 += (VEC_DATA_TYPE(float, 4))b;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100985#endif // defined(HAS_BIAS)
986
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100987 // Store the output tile
Giorgio Arenad056e572020-10-12 11:53:51 +0100988 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100989 (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100990 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100991 (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100992 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100993 (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
Giorgio Arenad056e572020-10-12 11:53:51 +0100994 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,
Usama Arif6a98a6e2019-05-10 17:07:27 +0100995 (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100996#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100997}
998
Gian Marco Iodiced28b7512018-07-06 12:59:28 +0100999/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001000 *
1001 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001002 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1003 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001004 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001005 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1006 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001007 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1008 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001009 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001010 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001011 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001012 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001013 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1014 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1015 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1016 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1017 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1018 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001019 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1020 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001021 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1022 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1023 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1024 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1025 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1026 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001027 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1028 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1029 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1030 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001031 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1032 */
Giorgio Arena83eee192021-04-08 16:41:51 +01001033__kernel void winograd_output_transform_4x4_5x5_nhwc(
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001034 TENSOR4D(src, BUFFER),
1035 TENSOR4D(dst, BUFFER),
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001036#if defined(HAS_BIAS)
1037 VECTOR_DECLARATION(bias),
1038#endif // defined(HAS_BIAS)
1039 int dst_size)
1040{
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001041 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
1042 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
1043 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001044
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001045#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Sheri Zhang6dbcc0e2021-04-12 10:53:57 +01001046 TILE(DATA_TYPE, 8, N0, in) = { { { 0 } } };
1047 TILE(DATA_TYPE, 4, N0, out) = { { { 0 } } };
1048 TILE(DATA_TYPE, 4, N0, tmp) = { { { 0 } } };
1049 TILE(uint, 8, 1, src_indirect_y) = { { { 0 } } };
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001050
1051 LOOP_UNROLLING(int, i, 0, 8, 1)
1052 {
1053 src_indirect_y[i].v = mout + i * SRC_HEIGHT;
1054 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
1055 }
1056
1057 // "in" contains 1x8 or 8x1 tile here
1058 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
1059
1060 // A^T * in, and in this degenerate case out consists of 1 column/row
1061 tmp[0].v = in[1].v - in[2].v;
1062 tmp[1].v = 2.0f * (in[3].v - in[4].v);
1063 tmp[2].v = 2.0f * (in[5].v + in[6].v);
1064 tmp[3].v = in[3].v + in[4].v;
1065 out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
Giorgio Arena83eee192021-04-08 16:41:51 +01001066 out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001067 out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
1068 out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001069
1070#if defined(HAS_BIAS)
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001071 TILE(DATA_TYPE, 1, N0, b);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001072
Gian Marco Iodice0b76f7d2021-04-08 17:20:00 +01001073 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001074
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001075 // c = c + bias[broadcasted]
1076 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
1077#endif // HAS_BIAS
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001078
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001079 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
1080 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
1081
1082 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
1083
Sheri Zhang6dbcc0e2021-04-12 10:53:57 +01001084 TILE(uint, 4, 1, dst_indirect_y) = { { { 0 } } };
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001085
1086 // Calculate the destination indirect Y
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001087#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001088 LOOP_UNROLLING(int, yk, 0, 4, 1)
1089 {
1090 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
1091 dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
1092 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
1093 }
Giorgio Arena83eee192021-04-08 16:41:51 +01001094#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001095 LOOP_UNROLLING(int, xk, 0, 4, 1)
1096 {
1097 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
1098 dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
1099 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
1100 }
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001101#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1102
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001103 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
1104 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
1105
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001106#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001107 // Calculate the indirect Y for the source tensor
Sheri Zhang6dbcc0e2021-04-12 10:53:57 +01001108 TILE(DATA_TYPE, 64, N0, in) = { { { 0 } } };
1109 TILE(DATA_TYPE, 6, N0, tmp) = { { { 0 } } };
1110 TILE(uint, 64, 1, src_indirect_y) = { { { 0 } } };
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001111
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001112 LOOP_UNROLLING(int, i, 0, 64, 1)
1113 {
1114 src_indirect_y[i].v = mout + i * SRC_HEIGHT;
1115 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
1116 }
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001117
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001118 // "in" here is 8x8 tile
1119 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001120
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001121 // A^T * in
1122 LOOP_UNROLLING(int, i, 0, 8, 1)
1123 {
Giorgio Arena83eee192021-04-08 16:41:51 +01001124 tmp[0].v = in[8 + i].v + in[16 + i].v;
1125 tmp[1].v = in[8 + i].v - in[16 + i].v;
1126 tmp[2].v = in[24 + i].v + in[32 + i].v;
1127 tmp[3].v = in[24 + i].v - in[32 + i].v;
1128 tmp[3].v = tmp[3].v + tmp[3].v;
1129 tmp[4].v = in[40 + i].v + in[48 + i].v;
1130 tmp[4].v = tmp[4].v + tmp[4].v;
1131 tmp[5].v = in[40 + i].v - in[48 + i].v;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001132
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001133 // 4x8 matrix as a result
1134 in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
1135 in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
1136 in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
1137 in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
1138 }
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001139
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001140 // Compute the output tile
Sheri Zhang6dbcc0e2021-04-12 10:53:57 +01001141 TILE(DATA_TYPE, 16, N0, out) = { { { 0 } } };
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001142
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001143 // in * A, with in = A^T * in as above
1144 LOOP_UNROLLING(int, i, 0, 4, 1)
1145 {
Giorgio Arena83eee192021-04-08 16:41:51 +01001146 tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
1147 tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
1148 tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
1149 tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
1150 tmp[3].v = tmp[3].v + tmp[3].v;
1151 tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
1152 tmp[4].v = tmp[4].v + tmp[4].v;
1153 tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001154
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001155 // 4x4 tile
1156 out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
1157 out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
1158 out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
1159 out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
1160 }
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001161
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001162#if defined(HAS_BIAS)
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001163 TILE(DATA_TYPE, 1, N0, b);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001164
Gian Marco Iodice0b76f7d2021-04-08 17:20:00 +01001165 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001166
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001167 // c = c + bias[broadcasted]
1168 T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
1169#endif // HAS_BIAS
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001170
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001171 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
1172 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001173
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001174 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001175
Sheri Zhang6dbcc0e2021-04-12 10:53:57 +01001176 TILE(uint, 16, 1, dst_indirect_y) = { { { 0 } } };
Aleksandr Nikolaev87781082021-04-08 10:06:34 +01001177
1178 // Calculate the destination indirect Y
1179 LOOP_UNROLLING(int, yk, 0, 4, 1)
1180 {
1181 LOOP_UNROLLING(int, xk, 0, 4, 1)
1182 {
1183 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
1184 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
1185 dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
1186 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
1187 }
1188 }
1189
1190 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
1191 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001192#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001193}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001194#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001195
1196#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001197#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001198/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
1199 *
1200 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1201 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1202 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1203 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001204 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001205 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001206 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001207 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1208 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1209 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1210 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1211 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1212 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001213 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1214 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001215 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1216 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1217 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1218 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1219 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1220 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001221 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1222 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1223 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1224 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001225 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1226 */
1227__kernel void winograd_output_transform_2x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001228 TENSOR4D_DECLARATION(src),
1229 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001230#if defined(HAS_BIAS)
1231 ,
1232 VECTOR_DECLARATION(bias)
1233#endif // defined(HAS_BIAS)
1234)
1235{
1236 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1237 src_stride_x,
1238 src_step_x,
1239 src_stride_y,
1240 src_step_y,
1241 src_stride_z,
1242 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001243 src_stride_w,
1244 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001245 src_offset_first_element_in_bytes,
1246 dst_ptr,
1247 dst_stride_x,
1248 dst_step_x,
1249 dst_stride_y,
1250 dst_step_y,
1251 dst_stride_z,
1252 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001253 dst_stride_w,
1254 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001255 dst_offset_first_element_in_bytes
1256#if defined(HAS_BIAS)
1257 ,
1258 bias_ptr,
1259 bias_stride_x,
1260 bias_step_x,
1261 bias_offset_first_element_in_bytes
1262#endif // defined(HAS_BIAS)
1263 );
1264}
giuros013bfacb22019-04-01 12:07:02 +01001265
1266/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
1267 *
1268 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1269 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1270 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001271 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1272 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
giuros013bfacb22019-04-01 12:07:02 +01001273 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1274 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1275 *
1276 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1277 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1278 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1279 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1280 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1281 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1282 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1283 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1284 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1285 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1286 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1287 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1288 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1289 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1290 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1291 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1292 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1293 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1294 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1295 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1296 */
1297__kernel void winograd_output_transform_2x1_7x1_nhwc(
1298 TENSOR4D_DECLARATION(src),
1299 TENSOR4D_DECLARATION(dst),
1300#if defined(HAS_BIAS)
1301 VECTOR_DECLARATION(bias),
1302#endif // defined(HAS_BIAS)
1303 int dst_size)
1304{
1305 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1306 src_stride_x,
1307 src_step_x,
1308 src_stride_y,
1309 src_step_y,
1310 src_stride_z,
1311 src_step_z,
1312 src_stride_w,
1313 src_step_w,
1314 src_offset_first_element_in_bytes,
1315 dst_ptr,
1316 dst_stride_x,
1317 dst_step_x,
1318 dst_stride_y,
1319 dst_step_y,
1320 dst_stride_z,
1321 dst_step_z,
1322 dst_stride_w,
1323 dst_step_w,
1324 dst_offset_first_element_in_bytes,
1325#if defined(HAS_BIAS)
1326 bias_ptr,
1327 bias_stride_x,
1328 bias_step_x,
1329 bias_offset_first_element_in_bytes,
1330#endif // defined(HAS_BIAS)
1331 dst_size);
1332}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001333#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001334
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001335#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001336/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
1337 *
1338 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1339 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1340 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1341 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001342 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001343 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001344 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001345 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1346 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1347 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1348 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1349 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1350 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001351 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1352 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001353 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1354 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1355 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1356 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1357 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1358 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001359 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1360 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1361 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1362 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001363 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1364 */
1365__kernel void winograd_output_transform_4x1_3x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001366 TENSOR4D_DECLARATION(src),
1367 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001368#if defined(HAS_BIAS)
1369 ,
1370 VECTOR_DECLARATION(bias)
1371#endif // defined(HAS_BIAS)
1372)
1373{
1374 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1375 src_stride_x,
1376 src_step_x,
1377 src_stride_y,
1378 src_step_y,
1379 src_stride_z,
1380 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001381 src_stride_w,
1382 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001383 src_offset_first_element_in_bytes,
1384 dst_ptr,
1385 dst_stride_x,
1386 dst_step_x,
1387 dst_stride_y,
1388 dst_step_y,
1389 dst_stride_z,
1390 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001391 dst_stride_w,
1392 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001393 dst_offset_first_element_in_bytes
1394#if defined(HAS_BIAS)
1395 ,
1396 bias_ptr,
1397 bias_stride_x,
1398 bias_step_x,
1399 bias_offset_first_element_in_bytes
1400#endif // defined(HAS_BIAS)
1401 );
1402}
1403
1404/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
1405 *
1406 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1407 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1408 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1409 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001410 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001411 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001412 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001413 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1414 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1415 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1416 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1417 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1418 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001419 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1420 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001421 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1422 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1423 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1424 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1425 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1426 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001427 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1428 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1429 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1430 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001431 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1432 */
1433__kernel void winograd_output_transform_4x1_5x1_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001434 TENSOR4D_DECLARATION(src),
1435 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001436#if defined(HAS_BIAS)
1437 ,
1438 VECTOR_DECLARATION(bias)
1439#endif // defined(HAS_BIAS)
1440)
1441{
1442 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1443 src_stride_x,
1444 src_step_x,
1445 src_stride_y,
1446 src_step_y,
1447 src_stride_z,
1448 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001449 src_stride_w,
1450 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001451 src_offset_first_element_in_bytes,
1452 dst_ptr,
1453 dst_stride_x,
1454 dst_step_x,
1455 dst_stride_y,
1456 dst_step_y,
1457 dst_stride_z,
1458 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001459 dst_stride_w,
1460 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001461 dst_offset_first_element_in_bytes
1462#if defined(HAS_BIAS)
1463 ,
1464 bias_ptr,
1465 bias_stride_x,
1466 bias_step_x,
1467 bias_offset_first_element_in_bytes
1468#endif // defined(HAS_BIAS)
1469 );
1470}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001471
1472/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
1473 *
1474 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1475 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1476 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001477 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1478 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001479 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001480 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001481 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001482 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001483 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1484 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1485 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1486 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1487 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1488 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001489 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1490 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001491 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1492 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1493 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1494 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1495 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1496 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001497 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1498 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1499 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1500 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001501 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1502 */
1503__kernel void winograd_output_transform_4x1_3x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001504 TENSOR4D_DECLARATION(src),
1505 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001506#if defined(HAS_BIAS)
1507 VECTOR_DECLARATION(bias),
1508#endif // defined(HAS_BIAS)
1509 int dst_size)
1510{
1511 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1512 src_stride_x,
1513 src_step_x,
1514 src_stride_y,
1515 src_step_y,
1516 src_stride_z,
1517 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001518 src_stride_w,
1519 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001520 src_offset_first_element_in_bytes,
1521 dst_ptr,
1522 dst_stride_x,
1523 dst_step_x,
1524 dst_stride_y,
1525 dst_step_y,
1526 dst_stride_z,
1527 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001528 dst_stride_w,
1529 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001530 dst_offset_first_element_in_bytes,
1531#if defined(HAS_BIAS)
1532 bias_ptr,
1533 bias_stride_x,
1534 bias_step_x,
1535 bias_offset_first_element_in_bytes,
1536#endif // defined(HAS_BIAS)
1537 dst_size);
1538}
1539
1540/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
1541 *
1542 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1543 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1544 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001545 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1546 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001547 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001548 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001549 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001550 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001551 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1552 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1553 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1554 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1555 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1556 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001557 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1558 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001559 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1560 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1561 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1562 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1563 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1564 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001565 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1566 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1567 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1568 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001569 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1570 */
1571__kernel void winograd_output_transform_4x1_5x1_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001572 TENSOR4D_DECLARATION(src),
1573 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001574#if defined(HAS_BIAS)
1575 VECTOR_DECLARATION(bias),
1576#endif // defined(HAS_BIAS)
1577 int dst_size)
1578{
1579 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1580 src_stride_x,
1581 src_step_x,
1582 src_stride_y,
1583 src_step_y,
1584 src_stride_z,
1585 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001586 src_stride_w,
1587 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001588 src_offset_first_element_in_bytes,
1589 dst_ptr,
1590 dst_stride_x,
1591 dst_step_x,
1592 dst_stride_y,
1593 dst_step_y,
1594 dst_stride_z,
1595 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001596 dst_stride_w,
1597 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001598 dst_offset_first_element_in_bytes,
1599#if defined(HAS_BIAS)
1600 bias_ptr,
1601 bias_stride_x,
1602 bias_step_x,
1603 bias_offset_first_element_in_bytes,
1604#endif // defined(HAS_BIAS)
1605 dst_size);
1606}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001607#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001608#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
1609
1610#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001611#if defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001612/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
1613 *
1614 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1615 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1616 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1617 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001618 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001619 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001620 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001621 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1622 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1623 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1624 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1625 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1626 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001627 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1628 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001629 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1630 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1631 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1632 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1633 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1634 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001635 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1636 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1637 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1638 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001639 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1640 */
1641__kernel void winograd_output_transform_1x2_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001642 TENSOR4D_DECLARATION(src),
1643 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001644#if defined(HAS_BIAS)
1645 ,
1646 VECTOR_DECLARATION(bias)
1647#endif // defined(HAS_BIAS)
1648)
1649{
1650 winograd_output_transform_2x2_3x3_nchw(src_ptr,
1651 src_stride_x,
1652 src_step_x,
1653 src_stride_y,
1654 src_step_y,
1655 src_stride_z,
1656 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001657 src_stride_w,
1658 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001659 src_offset_first_element_in_bytes,
1660 dst_ptr,
1661 dst_stride_x,
1662 dst_step_x,
1663 dst_stride_y,
1664 dst_step_y,
1665 dst_stride_z,
1666 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001667 dst_stride_w,
1668 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001669 dst_offset_first_element_in_bytes
1670#if defined(HAS_BIAS)
1671 ,
1672 bias_ptr,
1673 bias_stride_x,
1674 bias_step_x,
1675 bias_offset_first_element_in_bytes
1676#endif // defined(HAS_BIAS)
1677 );
1678}
giuros013bfacb22019-04-01 12:07:02 +01001679
1680/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
1681 *
1682 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1683 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1684 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001685 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1686 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
giuros013bfacb22019-04-01 12:07:02 +01001687 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1688 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1689 *
1690 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1691 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1692 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1693 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1694 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1695 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1696 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1697 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1698 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1699 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1700 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1701 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1702 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1703 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1704 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1705 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1706 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1707 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1708 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1709 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1710 */
1711__kernel void winograd_output_transform_1x2_1x7_nhwc(
1712 TENSOR4D_DECLARATION(src),
1713 TENSOR4D_DECLARATION(dst),
1714#if defined(HAS_BIAS)
1715 VECTOR_DECLARATION(bias),
1716#endif // defined(HAS_BIAS)
1717 int dst_size)
1718{
1719 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
1720 src_stride_x,
1721 src_step_x,
1722 src_stride_y,
1723 src_step_y,
1724 src_stride_z,
1725 src_step_z,
1726 src_stride_w,
1727 src_step_w,
1728 src_offset_first_element_in_bytes,
1729 dst_ptr,
1730 dst_stride_x,
1731 dst_step_x,
1732 dst_stride_y,
1733 dst_step_y,
1734 dst_stride_z,
1735 dst_step_z,
1736 dst_stride_w,
1737 dst_step_w,
1738 dst_offset_first_element_in_bytes,
1739#if defined(HAS_BIAS)
1740 bias_ptr,
1741 bias_stride_x,
1742 bias_step_x,
1743 bias_offset_first_element_in_bytes,
1744#endif // defined(HAS_BIAS)
1745 dst_size);
1746}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001747#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001748
Manuel Bottini0d0028c2018-10-02 16:41:52 +01001749#if defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001750/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
1751 *
1752 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1753 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1754 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1755 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001756 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001757 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001758 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001759 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1760 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1761 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1762 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1763 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1764 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001765 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1766 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001767 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1768 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1769 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1770 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1771 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1772 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001773 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1774 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1775 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1776 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001777 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1778 */
1779__kernel void winograd_output_transform_1x4_1x3_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001780 TENSOR4D_DECLARATION(src),
1781 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001782#if defined(HAS_BIAS)
1783 ,
1784 VECTOR_DECLARATION(bias)
1785#endif // defined(HAS_BIAS)
1786)
1787{
1788 winograd_output_transform_4x4_3x3_nchw(src_ptr,
1789 src_stride_x,
1790 src_step_x,
1791 src_stride_y,
1792 src_step_y,
1793 src_stride_z,
1794 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001795 src_stride_w,
1796 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001797 src_offset_first_element_in_bytes,
1798 dst_ptr,
1799 dst_stride_x,
1800 dst_step_x,
1801 dst_stride_y,
1802 dst_step_y,
1803 dst_stride_z,
1804 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001805 dst_stride_w,
1806 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001807 dst_offset_first_element_in_bytes
1808#if defined(HAS_BIAS)
1809 ,
1810 bias_ptr,
1811 bias_stride_x,
1812 bias_step_x,
1813 bias_offset_first_element_in_bytes
1814#endif // defined(HAS_BIAS)
1815 );
1816}
1817
1818/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
1819 *
1820 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1821 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1822 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1823 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001824 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001825 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001826 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001827 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1828 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1829 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1830 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1831 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1832 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001833 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1834 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001835 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1836 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1837 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1838 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1839 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1840 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001841 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1842 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1843 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1844 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001845 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1846 */
1847__kernel void winograd_output_transform_1x4_1x5_nchw(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001848 TENSOR4D_DECLARATION(src),
1849 TENSOR4D_DECLARATION(dst)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001850#if defined(HAS_BIAS)
1851 ,
1852 VECTOR_DECLARATION(bias)
1853#endif // defined(HAS_BIAS)
1854)
1855{
1856 winograd_output_transform_4x4_5x5_nchw(src_ptr,
1857 src_stride_x,
1858 src_step_x,
1859 src_stride_y,
1860 src_step_y,
1861 src_stride_z,
1862 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001863 src_stride_w,
1864 src_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001865 src_offset_first_element_in_bytes,
1866 dst_ptr,
1867 dst_stride_x,
1868 dst_step_x,
1869 dst_stride_y,
1870 dst_step_y,
1871 dst_stride_z,
1872 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001873 dst_stride_w,
1874 dst_step_w,
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001875 dst_offset_first_element_in_bytes
1876#if defined(HAS_BIAS)
1877 ,
1878 bias_ptr,
1879 bias_stride_x,
1880 bias_step_x,
1881 bias_offset_first_element_in_bytes
1882#endif // defined(HAS_BIAS)
1883 );
1884}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001885
1886/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
1887 *
1888 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1889 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1890 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001891 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1892 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001893 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001894 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001895 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001896 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001897 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1898 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1899 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1900 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1901 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1902 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001903 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1904 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001905 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1906 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1907 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1908 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1909 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1910 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001911 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1912 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1913 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1914 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001915 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1916 */
1917__kernel void winograd_output_transform_1x4_1x3_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001918 TENSOR4D_DECLARATION(src),
1919 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001920#if defined(HAS_BIAS)
1921 VECTOR_DECLARATION(bias),
1922#endif // defined(HAS_BIAS)
1923 int dst_size)
1924{
1925 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
1926 src_stride_x,
1927 src_step_x,
1928 src_stride_y,
1929 src_step_y,
1930 src_stride_z,
1931 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001932 src_stride_w,
1933 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001934 src_offset_first_element_in_bytes,
1935 dst_ptr,
1936 dst_stride_x,
1937 dst_step_x,
1938 dst_stride_y,
1939 dst_step_y,
1940 dst_stride_z,
1941 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001942 dst_stride_w,
1943 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001944 dst_offset_first_element_in_bytes,
1945#if defined(HAS_BIAS)
1946 bias_ptr,
1947 bias_stride_x,
1948 bias_step_x,
1949 bias_offset_first_element_in_bytes,
1950#endif // defined(HAS_BIAS)
1951 dst_size);
1952}
1953
1954/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
1955 *
1956 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
1957 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1958 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodice5f910412020-10-20 09:14:45 +01001959 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1960 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001961 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001962 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001963 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001964 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001965 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1966 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1967 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1968 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1969 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1970 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001971 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1972 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001973 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1974 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1975 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1976 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1977 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1978 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001979 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1980 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1981 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1982 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001983 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1984 */
1985__kernel void winograd_output_transform_1x4_1x5_nhwc(
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001986 TENSOR4D_DECLARATION(src),
1987 TENSOR4D_DECLARATION(dst),
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001988#if defined(HAS_BIAS)
1989 VECTOR_DECLARATION(bias),
1990#endif // defined(HAS_BIAS)
1991 int dst_size)
1992{
1993 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1994 src_stride_x,
1995 src_step_x,
1996 src_stride_y,
1997 src_step_y,
1998 src_stride_z,
1999 src_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002000 src_stride_w,
2001 src_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002002 src_offset_first_element_in_bytes,
2003 dst_ptr,
2004 dst_stride_x,
2005 dst_step_x,
2006 dst_stride_y,
2007 dst_step_y,
2008 dst_stride_z,
2009 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002010 dst_stride_w,
2011 dst_step_w,
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002012 dst_offset_first_element_in_bytes,
2013#if defined(HAS_BIAS)
2014 bias_ptr,
2015 bias_stride_x,
2016 bias_step_x,
2017 bias_offset_first_element_in_bytes,
2018#endif // defined(HAS_BIAS)
2019 dst_size);
2020}
Manuel Bottini0d0028c2018-10-02 16:41:52 +01002021#endif // defined(VEC_SIZE) && VEC_SIZE == 4
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002022#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002023#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)