blob: 9eb995fbb2287de1ac968b2a929ee3f89ecce00e [file] [log] [blame]
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001/*
Gian Marco Iodice905a3c12023-04-14 12:20:58 +01002 * Copyright (c) 2018-2023 Arm Limited.
Adnan AlSinan7075fe22021-07-05 13:12:52 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "activation_float_helpers.h"
25#include "helpers.h"
26#include "tile_helpers.h"
27
ramelg01bb6877a2022-02-08 09:38:17 +000028#if defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010029#if defined(VEC_SIZE) && VEC_SIZE == 2
ramelg01bb6877a2022-02-08 09:38:17 +000030#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010031/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
32 *
ramelg01bb6877a2022-02-08 09:38:17 +000033 * @note must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Adnan AlSinan7075fe22021-07-05 13:12:52 +010034 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
35 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
Adnan AlSinan7075fe22021-07-05 13:12:52 +010036 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
37 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
38 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
39 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
40 *
41 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
42 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
43 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
44 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
45 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
46 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
47 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
48 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
49 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
50 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
51 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
52 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
53 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
54 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
55 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
56 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
57 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
58 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
59 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
60 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +000061 * @param[in] _ISRC_HEIGHT The source tensor's height
62 * @param[in] _IDST_WIDTH The destination tensor's width
63 * @param[in] _IDST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +010064 */
65__kernel void winograd_output_transform_2x2_7x7_nhwc(
66 TENSOR4D(src, BUFFER),
67 TENSOR4D(dst, BUFFER),
68#if defined(HAS_BIAS)
69 VECTOR_DECLARATION(bias),
70#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +000071 int dst_size,
72 const int _ISRC_HEIGHT,
73 const int _IDST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +010074 const int _IDST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010075{
Adnan AlSinan7075fe22021-07-05 13:12:52 +010076 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
77 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
Gian Marco Iodice905a3c12023-04-14 12:20:58 +010078#if defined(IS_BATCHED)
79 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
80#else // defined(IS_BATCHED)
81 const int bout = 0; // BATCH SIZE IDX
82#endif // defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010083
Gian Marco Iodice0c687042022-06-14 15:13:16 +010084 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
85 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
Adnan AlSinan7075fe22021-07-05 13:12:52 +010086
87#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
88 TILE(DATA_TYPE, 8, N0, in);
89 TILE(DATA_TYPE, 2, N0, out);
90 TILE(uint, 8, 1, src_indirect_y);
91
92 // Calculate the indirect Y for the source tensor
93 LOOP_UNROLLING(int, i, 0, 1, 8,
94 {
95 src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
96 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
97 })
98
99 // Initialize the input tile
100 LOOP_UNROLLING(int, i, 0, 1, 8,
101 {
102 in[i].v = 0;
103 })
104
105 // Load the values across the 8 channels to compose the 8x1 tile
106 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
107
108 // Compute out0 and out01
109 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100110 out[1].v = -in[1].v + in[2].v - (DATA_TYPE)2.f * in[3].v + (DATA_TYPE)2.0f * in[4].v - (DATA_TYPE)3.0f * in[5].v + (DATA_TYPE)3.0f * in[6].v + in[7].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100111
112#if defined(HAS_BIAS)
113 // Add bias
114 TILE(DATA_TYPE, 1, N0, b);
115
116 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
117
Michalis Spyroub1fcefd2022-06-15 19:02:28 +0100118 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100119#endif // defined(HAS_BIAS)
120
121 T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
122
123 TILE(uint, 2, 1, dst_indirect_y);
124
125#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
126 LOOP_UNROLLING(int, yk, 0, 1, 2,
127 {
128 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
129 dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
130 })
131#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
132 LOOP_UNROLLING(int, xk, 0, 1, 2,
133 {
134 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
135 dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
136 })
137#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
138
139 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
140 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
141
142#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
143
144 TILE(DATA_TYPE, 64, N0, in);
145 TILE(DATA_TYPE, 4, N0, out);
146 TILE(DATA_TYPE, 16, N0, tmp);
147 TILE(uint, 64, 1, src_indirect_y);
148
149 // Calculate the indirect Y for the source tensor
150 LOOP_UNROLLING(int, i, 0, 1, 64,
151 {
152 src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
153 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
154 })
155
156 // Initialize the input tile
157 LOOP_UNROLLING(int, i, 0, 1, 64,
158 {
159 in[i].v = 0;
160 })
161
162 // Load the values across the 64 channels to compose the 8x8 tile
163 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
164
165 LOOP_UNROLLING(int, i, 0, 1, 8,
166 {
167 tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100168 tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - (DATA_TYPE)2 * in[24 + i].v + (DATA_TYPE)2 * in[32 + i].v + (DATA_TYPE) - 3 * in[40 + i].v + (DATA_TYPE)3 * in[48 + i].v + in[56 + i].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100169 })
170
171 // Compute the 2x2 output tile
172 LOOP_UNROLLING(int, i, 0, 1, 2,
173 {
174 out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100175 out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - (DATA_TYPE)2 * tmp[6 + i].v + (DATA_TYPE)2 * tmp[8 + i].v - (DATA_TYPE)3 * tmp[10 + i].v + (DATA_TYPE)3 * tmp[12 + i].v + tmp[14 + i].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100176 })
177
178#if defined(HAS_BIAS)
179 // Add bias
180 TILE(DATA_TYPE, 1, N0, b);
181
182 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
183
Michalis Spyroub1fcefd2022-06-15 19:02:28 +0100184 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100185#endif // defined(HAS_BIAS)
186
187 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
188
189 TILE(uint, 4, 1, dst_indirect_y);
190
191 // Calculate the destination indirect Y
192 LOOP_UNROLLING(int, yk, 0, 1, 2,
193 {
194 LOOP_UNROLLING(int, xk, 0, 1, 2,
195 {
196 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
197 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
198 dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;
199 dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
200 })
201 })
202
203 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
204 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
205#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
206}
ramelg01bb6877a2022-02-08 09:38:17 +0000207#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100208#endif // defined(VEC_SIZE) && VEC_SIZE == 2
209
210#if defined(VEC_SIZE) && VEC_SIZE == 4
ramelg01bb6877a2022-02-08 09:38:17 +0000211#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100212/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
213 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100214 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
215 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100216 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
217 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
218 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
219 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
220 *
221 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
222 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
223 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
224 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
225 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
226 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
227 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
228 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
229 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
230 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
231 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
232 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
233 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
234 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
235 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
236 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
237 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
238 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
239 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
240 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
241 * @param[in] dst_size Size of the destination tensor, minus the last padding
ramelg01bb6877a2022-02-08 09:38:17 +0000242 * @param[in] SRC_HEIGHT The source tensor's height
243 * @param[in] DST_WIDTH The destination tensor's width
244 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100245 */
246__kernel void winograd_output_transform_4x4_3x3_nhwc(
247 TENSOR4D(src, BUFFER),
248 TENSOR4D(dst, BUFFER),
249#if defined(HAS_BIAS)
250 VECTOR_DECLARATION(bias),
251#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000252 int dst_size,
253 const int SRC_HEIGHT,
254 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100255 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100256{
257 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
258 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100259#if defined(IS_BATCHED)
260 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
261#else // defined(IS_BATCHED)
262 const int bout = 0; // BATCH SIZE IDX
263#endif // defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100264
265#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
266
267 TILE(DATA_TYPE, 6, N0, in);
268 TILE(DATA_TYPE, 4, N0, out);
269 TILE(uint, 6, 1, src_indirect_y);
270
271 LOOP_UNROLLING(int, i, 0, 1, 6,
272 {
273 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
274 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
275 })
276
277 // Initialize the input tile
278 LOOP_UNROLLING(int, i, 0, 1, 6,
279 {
280 in[i].v = 0;
281 })
282
283 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
284 T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
285
286 // Compute out00, out01, out02 and out03
287 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100288 out[1].v = in[1].v - in[2].v + (DATA_TYPE)2.0f * in[3].v - (DATA_TYPE)2.0f * in[4].v;
289 out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * in[3].v + (DATA_TYPE)4.0f * in[4].v;
290 out[3].v = in[1].v - in[2].v + (DATA_TYPE)8.0f * in[3].v - (DATA_TYPE)8.0f * in[4].v + in[5].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100291
292#if defined(HAS_BIAS)
293 TILE(DATA_TYPE, 1, N0, b);
294
295 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
296
297 // c = c + bias[broadcasted]
Michalis Spyroub1fcefd2022-06-15 19:02:28 +0100298 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100299#endif // HAS_BIAS
300
301 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
302 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
303
304 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
305
306 TILE(uint, 4, 1, dst_indirect_y);
307
308 // Calculate the destination indirect Y
309#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
310 LOOP_UNROLLING(int, yk, 0, 1, 4,
311 {
312 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
313 dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
314 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
315 })
316#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
317 LOOP_UNROLLING(int, xk, 0, 1, 4,
318 {
319 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
320 dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
321 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
322 })
323#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
324
325 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
326 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
327
328#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
329
330 // Calculate the indirect Y for the source tensor
331 TILE(DATA_TYPE, 36, N0, in);
332 TILE(DATA_TYPE, 4, N0, tmp);
333 TILE(uint, 36, 1, src_indirect_y);
334
335 LOOP_UNROLLING(int, i, 0, 1, 36,
336 {
337 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
338 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
339 })
340
341 // Initialize the input tile
342 LOOP_UNROLLING(int, i, 0, 1, 36,
343 {
344 in[i].v = 0;
345 })
346
347 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
348 T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
349
350 LOOP_UNROLLING(int, i, 0, 1, 6,
351 {
352 tmp[0].v = in[6 + i].v + in[12 + i].v;
353 tmp[1].v = in[6 + i].v - in[12 + i].v;
354 tmp[2].v = in[18 + i].v + in[24 + i].v;
355 tmp[3].v = in[18 + i].v - in[24 + i].v;
356 tmp[3].v = tmp[3].v + tmp[3].v;
357 in[i].v = in[i].v + tmp[0].v + tmp[2].v;
358 in[6 + i].v = tmp[3].v + tmp[1].v;
359 in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
360 in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
361 })
362
363 // Compute the output tile
364 TILE(DATA_TYPE, 16, N0, out);
365
366 LOOP_UNROLLING(int, i, 0, 1, 4,
367 {
368 tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
369 tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
370 tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
371 tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
372 tmp[3].v = tmp[3].v + tmp[3].v;
373 out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
374 out[4 * i + 1].v = tmp[3].v + tmp[1].v;
375 out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
376 out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
377 })
378
379#if defined(HAS_BIAS)
380 TILE(DATA_TYPE, 1, N0, b);
381
382 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
383
384 // c = c + bias[broadcasted]
Michalis Spyroub1fcefd2022-06-15 19:02:28 +0100385 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100386#endif // HAS_BIAS
387
388 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
389 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
390
391 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
392
393 TILE(uint, 16, 1, dst_indirect_y);
394
395 // Calculate the destination indirect Y
396 LOOP_UNROLLING(int, yk, 0, 1, 4,
397 {
398 LOOP_UNROLLING(int, xk, 0, 1, 4,
399 {
400 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
401 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
402 dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
403 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
404 })
405 })
406
407 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
408 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
409#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
410}
ramelg01bb6877a2022-02-08 09:38:17 +0000411#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100412
ramelg01bb6877a2022-02-08 09:38:17 +0000413#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100414/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
415 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100416 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
417 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100418 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
419 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
420 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
421 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
422 *
423 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
424 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
425 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
426 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
427 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
428 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
429 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
430 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
431 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
432 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
433 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
434 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
435 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
436 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
437 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
438 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
439 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
440 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
441 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
442 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000443 * @param[in] SRC_HEIGHT The source tensor's height
444 * @param[in] DST_WIDTH The destination tensor's width
445 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100446 */
447__kernel void winograd_output_transform_4x4_5x5_nhwc(
448 TENSOR4D(src, BUFFER),
449 TENSOR4D(dst, BUFFER),
450#if defined(HAS_BIAS)
451 VECTOR_DECLARATION(bias),
452#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000453 int dst_size,
454 const int SRC_HEIGHT,
455 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100456 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100457{
458 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
459 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100460#if defined(IS_BATCHED)
461 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
462#else // defined(IS_BATCHED)
463 const int bout = 0; // BATCH SIZE IDX
464#endif // defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100465
466#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
467 TILE(DATA_TYPE, 8, N0, in);
468 TILE(DATA_TYPE, 4, N0, out);
469 TILE(DATA_TYPE, 4, N0, tmp);
470 TILE(uint, 8, 1, src_indirect_y);
471
472 LOOP_UNROLLING(int, i, 0, 1, 8,
473 {
474 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
475 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
476 })
477
478 // Initialize the input tile
479 LOOP_UNROLLING(int, i, 0, 1, 8,
480 {
481 in[i].v = 0;
482 })
483
484 // "in" contains 1x8 or 8x1 tile here
485 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
486
487 // A^T * in, and in this degenerate case out consists of 1 column/row
488 tmp[0].v = in[1].v - in[2].v;
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100489 tmp[1].v = (DATA_TYPE)2.0f * (in[3].v - in[4].v);
490 tmp[2].v = (DATA_TYPE)2.0f * (in[5].v + in[6].v);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100491 tmp[3].v = in[3].v + in[4].v;
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100492 out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + (DATA_TYPE)4.0f * tmp[2].v;
493 out[1].v = tmp[0].v + tmp[1].v + (DATA_TYPE)4.0f * (in[5].v - in[6].v);
494 out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * tmp[3].v + tmp[2].v;
495 out[3].v = tmp[0].v + (DATA_TYPE)4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100496
497#if defined(HAS_BIAS)
498 TILE(DATA_TYPE, 1, N0, b);
499
500 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
501
502 // c = c + bias[broadcasted]
Michalis Spyroub1fcefd2022-06-15 19:02:28 +0100503 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100504#endif // HAS_BIAS
505
506 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
507 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
508
509 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
510
511 TILE(uint, 4, 1, dst_indirect_y);
512
513 // Calculate the destination indirect Y
514#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
515 LOOP_UNROLLING(int, yk, 0, 1, 4,
516 {
517 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
518 dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
519 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
520 })
521#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
522 LOOP_UNROLLING(int, xk, 0, 1, 4,
523 {
524 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
525 dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
526 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
527 })
528#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
529
530 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
531 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
532
533#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
534 // Calculate the indirect Y for the source tensor
535 TILE(DATA_TYPE, 64, N0, in);
536 TILE(DATA_TYPE, 6, N0, tmp);
537 TILE(uint, 64, 1, src_indirect_y);
538
539 LOOP_UNROLLING(int, i, 0, 1, 64,
540 {
541 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
542 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
543 })
544
545 // Initialize the input tile
546 LOOP_UNROLLING(int, i, 0, 1, 64,
547 {
548 in[i].v = 0;
549 })
550
551 // "in" here is 8x8 tile
552 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
553
554 // A^T * in
555 LOOP_UNROLLING(int, i, 0, 1, 8,
556 {
557 tmp[0].v = in[8 + i].v + in[16 + i].v;
558 tmp[1].v = in[8 + i].v - in[16 + i].v;
559 tmp[2].v = in[24 + i].v + in[32 + i].v;
560 tmp[3].v = in[24 + i].v - in[32 + i].v;
561 tmp[3].v = tmp[3].v + tmp[3].v;
562 tmp[4].v = in[40 + i].v + in[48 + i].v;
563 tmp[4].v = tmp[4].v + tmp[4].v;
564 tmp[5].v = in[40 + i].v - in[48 + i].v;
565
566 // 4x8 matrix as a result
567 in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
568 in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
569 in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
570 in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
571 })
572
573 // Compute the output tile
574 TILE(DATA_TYPE, 16, N0, out);
575
576 // in * A, with in = A^T * in as above
577 LOOP_UNROLLING(int, i, 0, 1, 4,
578 {
579 tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
580 tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
581 tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
582 tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
583 tmp[3].v = tmp[3].v + tmp[3].v;
584 tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
585 tmp[4].v = tmp[4].v + tmp[4].v;
586 tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
587
588 // 4x4 tile
589 out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
590 out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
591 out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
592 out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
593 })
594
595#if defined(HAS_BIAS)
596 TILE(DATA_TYPE, 1, N0, b);
597
598 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
599
600 // c = c + bias[broadcasted]
Michalis Spyroub1fcefd2022-06-15 19:02:28 +0100601 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100602#endif // HAS_BIAS
603
604 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
605 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
606
607 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
608
609 TILE(uint, 16, 1, dst_indirect_y);
610
611 // Calculate the destination indirect Y
612 LOOP_UNROLLING(int, yk, 0, 1, 4,
613 {
614 LOOP_UNROLLING(int, xk, 0, 1, 4,
615 {
616 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
617 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
618 dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
619 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
620 })
621 })
622
623 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
624 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
625#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
626}
ramelg01bb6877a2022-02-08 09:38:17 +0000627#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100628#endif // defined(VEC_SIZE) && VEC_SIZE == 4
629
630#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
631#if defined(VEC_SIZE) && VEC_SIZE == 2
ramelg01bb6877a2022-02-08 09:38:17 +0000632#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100633/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
634 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100635 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
636 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
637 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
638 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
639 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
640 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
641 *
642 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
643 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
644 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
645 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
646 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
647 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
648 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
649 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
650 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
651 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
652 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
653 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
654 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
655 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
656 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
657 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
658 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
659 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
660 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
661 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000662 * @param[in] SRC_HEIGHT The source tensor's height
663 * @param[in] DST_WIDTH The destination tensor's width
664 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100665 */
666__kernel void winograd_output_transform_2x1_7x1_nhwc(
667 TENSOR4D_DECLARATION(src),
668 TENSOR4D_DECLARATION(dst),
669#if defined(HAS_BIAS)
670 VECTOR_DECLARATION(bias),
671#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000672 int dst_size,
673 const int SRC_HEIGHT,
674 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100675 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100676{
677 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
678 src_stride_x,
679 src_step_x,
680 src_stride_y,
681 src_step_y,
682 src_stride_z,
683 src_step_z,
684 src_stride_w,
685 src_step_w,
686 src_offset_first_element_in_bytes,
687 dst_ptr,
688 dst_stride_x,
689 dst_step_x,
690 dst_stride_y,
691 dst_step_y,
692 dst_stride_z,
693 dst_step_z,
694 dst_stride_w,
695 dst_step_w,
696 dst_offset_first_element_in_bytes,
697#if defined(HAS_BIAS)
698 bias_ptr,
699 bias_stride_x,
700 bias_step_x,
701 bias_offset_first_element_in_bytes,
702#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000703 dst_size,
704 SRC_HEIGHT,
705 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100706 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100707}
ramelg01bb6877a2022-02-08 09:38:17 +0000708#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100709#endif // defined(VEC_SIZE) && VEC_SIZE == 2
710
711#if defined(VEC_SIZE) && VEC_SIZE == 4
ramelg01bb6877a2022-02-08 09:38:17 +0000712#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100713/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
714 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100715 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
716 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
717 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
718 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
719 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
720 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
721 *
722 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
723 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
724 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
725 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
726 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
727 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
728 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
729 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
730 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
731 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
732 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
733 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
734 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
735 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
736 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
737 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
738 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
739 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
740 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
741 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000742 * @param[in] SRC_HEIGHT The source tensor's height
743 * @param[in] DST_WIDTH The destination tensor's width
744 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100745 */
746__kernel void winograd_output_transform_4x1_3x1_nhwc(
747 TENSOR4D_DECLARATION(src),
748 TENSOR4D_DECLARATION(dst),
749#if defined(HAS_BIAS)
750 VECTOR_DECLARATION(bias),
751#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000752 int dst_size,
753 const int SRC_HEIGHT,
754 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100755 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100756{
757 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
758 src_stride_x,
759 src_step_x,
760 src_stride_y,
761 src_step_y,
762 src_stride_z,
763 src_step_z,
764 src_stride_w,
765 src_step_w,
766 src_offset_first_element_in_bytes,
767 dst_ptr,
768 dst_stride_x,
769 dst_step_x,
770 dst_stride_y,
771 dst_step_y,
772 dst_stride_z,
773 dst_step_z,
774 dst_stride_w,
775 dst_step_w,
776 dst_offset_first_element_in_bytes,
777#if defined(HAS_BIAS)
778 bias_ptr,
779 bias_stride_x,
780 bias_step_x,
781 bias_offset_first_element_in_bytes,
782#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000783 dst_size,
784 SRC_HEIGHT,
785 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100786 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100787}
ramelg01bb6877a2022-02-08 09:38:17 +0000788#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100789
ramelg01bb6877a2022-02-08 09:38:17 +0000790#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100791/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
792 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100793 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
794 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
795 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
796 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
797 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
798 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
799 *
800 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
801 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
802 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
803 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
804 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
805 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
806 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
807 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
808 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
809 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
810 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
811 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
812 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
813 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
814 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
815 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
816 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
817 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
818 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
819 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000820 * @param[in] SRC_HEIGHT The source tensor's height
821 * @param[in] DST_WIDTH The destination tensor's width
822 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100823 */
824__kernel void winograd_output_transform_4x1_5x1_nhwc(
825 TENSOR4D_DECLARATION(src),
826 TENSOR4D_DECLARATION(dst),
827#if defined(HAS_BIAS)
828 VECTOR_DECLARATION(bias),
829#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000830 int dst_size,
831 const int SRC_HEIGHT,
832 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100833 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100834{
835 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
836 src_stride_x,
837 src_step_x,
838 src_stride_y,
839 src_step_y,
840 src_stride_z,
841 src_step_z,
842 src_stride_w,
843 src_step_w,
844 src_offset_first_element_in_bytes,
845 dst_ptr,
846 dst_stride_x,
847 dst_step_x,
848 dst_stride_y,
849 dst_step_y,
850 dst_stride_z,
851 dst_step_z,
852 dst_stride_w,
853 dst_step_w,
854 dst_offset_first_element_in_bytes,
855#if defined(HAS_BIAS)
856 bias_ptr,
857 bias_stride_x,
858 bias_step_x,
859 bias_offset_first_element_in_bytes,
860#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000861 dst_size,
862 SRC_HEIGHT,
863 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100864 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100865}
ramelg01bb6877a2022-02-08 09:38:17 +0000866#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100867#endif // defined(VEC_SIZE) && VEC_SIZE == 4
868#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
869
870#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
871#if defined(VEC_SIZE) && VEC_SIZE == 2
ramelg01bb6877a2022-02-08 09:38:17 +0000872#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100873/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
874 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100875 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
876 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
877 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
878 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
879 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
880 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
881 *
882 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
883 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
884 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
885 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
886 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
887 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
888 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
889 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
890 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
891 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
892 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
893 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
894 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
895 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
896 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
897 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
898 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
899 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
900 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
901 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000902 * @param[in] SRC_HEIGHT The source tensor's height
903 * @param[in] DST_WIDTH The destination tensor's width
904 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100905 */
906__kernel void winograd_output_transform_1x2_1x7_nhwc(
907 TENSOR4D_DECLARATION(src),
908 TENSOR4D_DECLARATION(dst),
909#if defined(HAS_BIAS)
910 VECTOR_DECLARATION(bias),
911#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000912 int dst_size,
913 const int SRC_HEIGHT,
914 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100915 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100916{
917 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
918 src_stride_x,
919 src_step_x,
920 src_stride_y,
921 src_step_y,
922 src_stride_z,
923 src_step_z,
924 src_stride_w,
925 src_step_w,
926 src_offset_first_element_in_bytes,
927 dst_ptr,
928 dst_stride_x,
929 dst_step_x,
930 dst_stride_y,
931 dst_step_y,
932 dst_stride_z,
933 dst_step_z,
934 dst_stride_w,
935 dst_step_w,
936 dst_offset_first_element_in_bytes,
937#if defined(HAS_BIAS)
938 bias_ptr,
939 bias_stride_x,
940 bias_step_x,
941 bias_offset_first_element_in_bytes,
942#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000943 dst_size,
944 SRC_HEIGHT,
945 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100946 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100947}
ramelg01bb6877a2022-02-08 09:38:17 +0000948#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100949#endif // defined(VEC_SIZE) && VEC_SIZE == 2
950
951#if defined(VEC_SIZE) && VEC_SIZE == 4
ramelg01bb6877a2022-02-08 09:38:17 +0000952#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100953/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
954 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100955 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
956 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
957 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
958 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
959 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
960 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
961 *
962 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
963 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
964 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
965 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
966 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
967 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
968 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
969 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
970 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
971 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
972 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
973 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
974 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
975 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
976 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
977 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
978 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
979 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
980 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
981 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000982 * @param[in] SRC_HEIGHT The source tensor's height
983 * @param[in] DST_WIDTH The destination tensor's width
984 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100985 */
986__kernel void winograd_output_transform_1x4_1x3_nhwc(
987 TENSOR4D_DECLARATION(src),
988 TENSOR4D_DECLARATION(dst),
989#if defined(HAS_BIAS)
990 VECTOR_DECLARATION(bias),
991#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000992 int dst_size,
993 const int SRC_HEIGHT,
994 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100995 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100996{
997 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
998 src_stride_x,
999 src_step_x,
1000 src_stride_y,
1001 src_step_y,
1002 src_stride_z,
1003 src_step_z,
1004 src_stride_w,
1005 src_step_w,
1006 src_offset_first_element_in_bytes,
1007 dst_ptr,
1008 dst_stride_x,
1009 dst_step_x,
1010 dst_stride_y,
1011 dst_step_y,
1012 dst_stride_z,
1013 dst_step_z,
1014 dst_stride_w,
1015 dst_step_w,
1016 dst_offset_first_element_in_bytes,
1017#if defined(HAS_BIAS)
1018 bias_ptr,
1019 bias_stride_x,
1020 bias_step_x,
1021 bias_offset_first_element_in_bytes,
1022#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +00001023 dst_size,
1024 SRC_HEIGHT,
1025 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +01001026 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001027}
ramelg01bb6877a2022-02-08 09:38:17 +00001028#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001029
ramelg01bb6877a2022-02-08 09:38:17 +00001030#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001031/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
1032 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001033 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1034 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1035 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1036 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
1037 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1038 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1039 *
1040 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1041 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1042 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1043 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1044 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1045 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1046 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1047 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1048 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1049 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1050 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1051 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1052 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1053 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1054 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1055 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1056 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1057 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1058 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1059 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +00001060 * @param[in] SRC_HEIGHT The source tensor's height
1061 * @param[in] DST_WIDTH The destination tensor's width
1062 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001063 */
1064__kernel void winograd_output_transform_1x4_1x5_nhwc(
1065 TENSOR4D_DECLARATION(src),
1066 TENSOR4D_DECLARATION(dst),
1067#if defined(HAS_BIAS)
1068 VECTOR_DECLARATION(bias),
1069#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +00001070 int dst_size,
1071 const int SRC_HEIGHT,
1072 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +01001073 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001074{
1075 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1076 src_stride_x,
1077 src_step_x,
1078 src_stride_y,
1079 src_step_y,
1080 src_stride_z,
1081 src_step_z,
1082 src_stride_w,
1083 src_step_w,
1084 src_offset_first_element_in_bytes,
1085 dst_ptr,
1086 dst_stride_x,
1087 dst_step_x,
1088 dst_stride_y,
1089 dst_step_y,
1090 dst_stride_z,
1091 dst_step_z,
1092 dst_stride_w,
1093 dst_step_w,
1094 dst_offset_first_element_in_bytes,
1095#if defined(HAS_BIAS)
1096 bias_ptr,
1097 bias_stride_x,
1098 bias_step_x,
1099 bias_offset_first_element_in_bytes,
1100#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +00001101 dst_size,
1102 SRC_HEIGHT,
1103 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +01001104 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001105}
ramelg01bb6877a2022-02-08 09:38:17 +00001106#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001107#endif // defined(VEC_SIZE) && VEC_SIZE == 4
1108#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
Gian Marco Iodice905a3c12023-04-14 12:20:58 +01001109#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)