blob: bab2ee850c7cec8f0c7df1bf62ce61f891072291 [file] [log] [blame]
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001/*
ramelg01bb6877a2022-02-08 09:38:17 +00002 * Copyright (c) 2018-2022 Arm Limited.
Adnan AlSinan7075fe22021-07-05 13:12:52 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "activation_float_helpers.h"
25#include "helpers.h"
26#include "tile_helpers.h"
27
ramelg01bb6877a2022-02-08 09:38:17 +000028#if defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010029#if defined(VEC_SIZE) && VEC_SIZE == 2
ramelg01bb6877a2022-02-08 09:38:17 +000030#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010031/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
32 *
ramelg01bb6877a2022-02-08 09:38:17 +000033 * @note must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
Adnan AlSinan7075fe22021-07-05 13:12:52 +010034 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
35 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
Adnan AlSinan7075fe22021-07-05 13:12:52 +010036 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
37 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
38 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
39 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
40 *
41 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
42 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
43 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
44 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
45 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
46 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
47 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
48 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
49 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
50 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
51 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
52 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
53 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
54 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
55 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
56 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
57 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
58 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
59 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
60 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +000061 * @param[in] _ISRC_HEIGHT The source tensor's height
62 * @param[in] _IDST_WIDTH The destination tensor's width
63 * @param[in] _IDST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +010064 */
65__kernel void winograd_output_transform_2x2_7x7_nhwc(
66 TENSOR4D(src, BUFFER),
67 TENSOR4D(dst, BUFFER),
68#if defined(HAS_BIAS)
69 VECTOR_DECLARATION(bias),
70#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +000071 int dst_size,
72 const int _ISRC_HEIGHT,
73 const int _IDST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +010074 const int _IDST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010075{
Adnan AlSinan7075fe22021-07-05 13:12:52 +010076 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
77 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
78 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
79
Gian Marco Iodice0c687042022-06-14 15:13:16 +010080 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
81 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
Adnan AlSinan7075fe22021-07-05 13:12:52 +010082
83#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
84 TILE(DATA_TYPE, 8, N0, in);
85 TILE(DATA_TYPE, 2, N0, out);
86 TILE(uint, 8, 1, src_indirect_y);
87
88 // Calculate the indirect Y for the source tensor
89 LOOP_UNROLLING(int, i, 0, 1, 8,
90 {
91 src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
92 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
93 })
94
95 // Initialize the input tile
96 LOOP_UNROLLING(int, i, 0, 1, 8,
97 {
98 in[i].v = 0;
99 })
100
101 // Load the values across the 8 channels to compose the 8x1 tile
102 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
103
104 // Compute out0 and out01
105 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
106 out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
107
108#if defined(HAS_BIAS)
109 // Add bias
110 TILE(DATA_TYPE, 1, N0, b);
111
112 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
113
114 T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
115#endif // defined(HAS_BIAS)
116
117 T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
118
119 TILE(uint, 2, 1, dst_indirect_y);
120
121#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
122 LOOP_UNROLLING(int, yk, 0, 1, 2,
123 {
124 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
125 dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
126 })
127#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
128 LOOP_UNROLLING(int, xk, 0, 1, 2,
129 {
130 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
131 dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
132 })
133#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
134
135 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
136 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
137
138#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
139
140 TILE(DATA_TYPE, 64, N0, in);
141 TILE(DATA_TYPE, 4, N0, out);
142 TILE(DATA_TYPE, 16, N0, tmp);
143 TILE(uint, 64, 1, src_indirect_y);
144
145 // Calculate the indirect Y for the source tensor
146 LOOP_UNROLLING(int, i, 0, 1, 64,
147 {
148 src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
149 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
150 })
151
152 // Initialize the input tile
153 LOOP_UNROLLING(int, i, 0, 1, 64,
154 {
155 in[i].v = 0;
156 })
157
158 // Load the values across the 64 channels to compose the 8x8 tile
159 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
160
161 LOOP_UNROLLING(int, i, 0, 1, 8,
162 {
163 tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
164 tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
165 })
166
167 // Compute the 2x2 output tile
168 LOOP_UNROLLING(int, i, 0, 1, 2,
169 {
170 out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
171 out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
172 })
173
174#if defined(HAS_BIAS)
175 // Add bias
176 TILE(DATA_TYPE, 1, N0, b);
177
178 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
179
180 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
181#endif // defined(HAS_BIAS)
182
183 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
184
185 TILE(uint, 4, 1, dst_indirect_y);
186
187 // Calculate the destination indirect Y
188 LOOP_UNROLLING(int, yk, 0, 1, 2,
189 {
190 LOOP_UNROLLING(int, xk, 0, 1, 2,
191 {
192 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
193 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
194 dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;
195 dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
196 })
197 })
198
199 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
200 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
201#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
202}
ramelg01bb6877a2022-02-08 09:38:17 +0000203#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100204#endif // defined(VEC_SIZE) && VEC_SIZE == 2
205
206#if defined(VEC_SIZE) && VEC_SIZE == 4
ramelg01bb6877a2022-02-08 09:38:17 +0000207#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100208/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
209 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100210 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
211 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100212 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
213 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
214 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
215 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
216 *
217 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
218 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
219 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
220 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
221 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
222 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
223 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
224 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
225 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
226 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
227 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
228 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
229 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
230 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
231 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
232 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
233 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
234 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
235 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
236 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
237 * @param[in] dst_size Size of the destination tensor, minus the last padding
ramelg01bb6877a2022-02-08 09:38:17 +0000238 * @param[in] SRC_HEIGHT The source tensor's height
239 * @param[in] DST_WIDTH The destination tensor's width
240 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100241 */
242__kernel void winograd_output_transform_4x4_3x3_nhwc(
243 TENSOR4D(src, BUFFER),
244 TENSOR4D(dst, BUFFER),
245#if defined(HAS_BIAS)
246 VECTOR_DECLARATION(bias),
247#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000248 int dst_size,
249 const int SRC_HEIGHT,
250 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100251 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100252{
253 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
254 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
255 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
256
257#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
258
259 TILE(DATA_TYPE, 6, N0, in);
260 TILE(DATA_TYPE, 4, N0, out);
261 TILE(uint, 6, 1, src_indirect_y);
262
263 LOOP_UNROLLING(int, i, 0, 1, 6,
264 {
265 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
266 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
267 })
268
269 // Initialize the input tile
270 LOOP_UNROLLING(int, i, 0, 1, 6,
271 {
272 in[i].v = 0;
273 })
274
275 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
276 T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
277
278 // Compute out00, out01, out02 and out03
279 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
280 out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
281 out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
282 out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
283
284#if defined(HAS_BIAS)
285 TILE(DATA_TYPE, 1, N0, b);
286
287 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
288
289 // c = c + bias[broadcasted]
290 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
291#endif // HAS_BIAS
292
293 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
294 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
295
296 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
297
298 TILE(uint, 4, 1, dst_indirect_y);
299
300 // Calculate the destination indirect Y
301#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
302 LOOP_UNROLLING(int, yk, 0, 1, 4,
303 {
304 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
305 dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
306 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
307 })
308#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
309 LOOP_UNROLLING(int, xk, 0, 1, 4,
310 {
311 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
312 dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
313 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
314 })
315#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
316
317 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
318 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
319
320#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
321
322 // Calculate the indirect Y for the source tensor
323 TILE(DATA_TYPE, 36, N0, in);
324 TILE(DATA_TYPE, 4, N0, tmp);
325 TILE(uint, 36, 1, src_indirect_y);
326
327 LOOP_UNROLLING(int, i, 0, 1, 36,
328 {
329 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
330 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
331 })
332
333 // Initialize the input tile
334 LOOP_UNROLLING(int, i, 0, 1, 36,
335 {
336 in[i].v = 0;
337 })
338
339 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
340 T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
341
342 LOOP_UNROLLING(int, i, 0, 1, 6,
343 {
344 tmp[0].v = in[6 + i].v + in[12 + i].v;
345 tmp[1].v = in[6 + i].v - in[12 + i].v;
346 tmp[2].v = in[18 + i].v + in[24 + i].v;
347 tmp[3].v = in[18 + i].v - in[24 + i].v;
348 tmp[3].v = tmp[3].v + tmp[3].v;
349 in[i].v = in[i].v + tmp[0].v + tmp[2].v;
350 in[6 + i].v = tmp[3].v + tmp[1].v;
351 in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
352 in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
353 })
354
355 // Compute the output tile
356 TILE(DATA_TYPE, 16, N0, out);
357
358 LOOP_UNROLLING(int, i, 0, 1, 4,
359 {
360 tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
361 tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
362 tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
363 tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
364 tmp[3].v = tmp[3].v + tmp[3].v;
365 out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
366 out[4 * i + 1].v = tmp[3].v + tmp[1].v;
367 out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
368 out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
369 })
370
371#if defined(HAS_BIAS)
372 TILE(DATA_TYPE, 1, N0, b);
373
374 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
375
376 // c = c + bias[broadcasted]
377 T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
378#endif // HAS_BIAS
379
380 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
381 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
382
383 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
384
385 TILE(uint, 16, 1, dst_indirect_y);
386
387 // Calculate the destination indirect Y
388 LOOP_UNROLLING(int, yk, 0, 1, 4,
389 {
390 LOOP_UNROLLING(int, xk, 0, 1, 4,
391 {
392 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
393 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
394 dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
395 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
396 })
397 })
398
399 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
400 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
401#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
402}
ramelg01bb6877a2022-02-08 09:38:17 +0000403#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100404
ramelg01bb6877a2022-02-08 09:38:17 +0000405#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100406/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
407 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100408 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
409 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100410 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
411 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
412 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
413 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
414 *
415 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
416 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
417 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
418 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
419 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
420 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
421 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
422 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
423 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
424 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
425 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
426 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
427 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
428 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
429 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
430 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
431 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
432 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
433 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
434 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000435 * @param[in] SRC_HEIGHT The source tensor's height
436 * @param[in] DST_WIDTH The destination tensor's width
437 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100438 */
439__kernel void winograd_output_transform_4x4_5x5_nhwc(
440 TENSOR4D(src, BUFFER),
441 TENSOR4D(dst, BUFFER),
442#if defined(HAS_BIAS)
443 VECTOR_DECLARATION(bias),
444#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000445 int dst_size,
446 const int SRC_HEIGHT,
447 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100448 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100449{
450 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
451 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
452 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
453
454#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
455 TILE(DATA_TYPE, 8, N0, in);
456 TILE(DATA_TYPE, 4, N0, out);
457 TILE(DATA_TYPE, 4, N0, tmp);
458 TILE(uint, 8, 1, src_indirect_y);
459
460 LOOP_UNROLLING(int, i, 0, 1, 8,
461 {
462 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
463 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
464 })
465
466 // Initialize the input tile
467 LOOP_UNROLLING(int, i, 0, 1, 8,
468 {
469 in[i].v = 0;
470 })
471
472 // "in" contains 1x8 or 8x1 tile here
473 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
474
475 // A^T * in, and in this degenerate case out consists of 1 column/row
476 tmp[0].v = in[1].v - in[2].v;
477 tmp[1].v = 2.0f * (in[3].v - in[4].v);
478 tmp[2].v = 2.0f * (in[5].v + in[6].v);
479 tmp[3].v = in[3].v + in[4].v;
480 out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
481 out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
482 out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
483 out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
484
485#if defined(HAS_BIAS)
486 TILE(DATA_TYPE, 1, N0, b);
487
488 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
489
490 // c = c + bias[broadcasted]
491 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
492#endif // HAS_BIAS
493
494 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
495 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
496
497 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
498
499 TILE(uint, 4, 1, dst_indirect_y);
500
501 // Calculate the destination indirect Y
502#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
503 LOOP_UNROLLING(int, yk, 0, 1, 4,
504 {
505 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
506 dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
507 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
508 })
509#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
510 LOOP_UNROLLING(int, xk, 0, 1, 4,
511 {
512 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
513 dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
514 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
515 })
516#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
517
518 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
519 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
520
521#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
522 // Calculate the indirect Y for the source tensor
523 TILE(DATA_TYPE, 64, N0, in);
524 TILE(DATA_TYPE, 6, N0, tmp);
525 TILE(uint, 64, 1, src_indirect_y);
526
527 LOOP_UNROLLING(int, i, 0, 1, 64,
528 {
529 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
530 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
531 })
532
533 // Initialize the input tile
534 LOOP_UNROLLING(int, i, 0, 1, 64,
535 {
536 in[i].v = 0;
537 })
538
539 // "in" here is 8x8 tile
540 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
541
542 // A^T * in
543 LOOP_UNROLLING(int, i, 0, 1, 8,
544 {
545 tmp[0].v = in[8 + i].v + in[16 + i].v;
546 tmp[1].v = in[8 + i].v - in[16 + i].v;
547 tmp[2].v = in[24 + i].v + in[32 + i].v;
548 tmp[3].v = in[24 + i].v - in[32 + i].v;
549 tmp[3].v = tmp[3].v + tmp[3].v;
550 tmp[4].v = in[40 + i].v + in[48 + i].v;
551 tmp[4].v = tmp[4].v + tmp[4].v;
552 tmp[5].v = in[40 + i].v - in[48 + i].v;
553
554 // 4x8 matrix as a result
555 in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
556 in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
557 in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
558 in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
559 })
560
561 // Compute the output tile
562 TILE(DATA_TYPE, 16, N0, out);
563
564 // in * A, with in = A^T * in as above
565 LOOP_UNROLLING(int, i, 0, 1, 4,
566 {
567 tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
568 tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
569 tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
570 tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
571 tmp[3].v = tmp[3].v + tmp[3].v;
572 tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
573 tmp[4].v = tmp[4].v + tmp[4].v;
574 tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
575
576 // 4x4 tile
577 out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
578 out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
579 out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
580 out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
581 })
582
583#if defined(HAS_BIAS)
584 TILE(DATA_TYPE, 1, N0, b);
585
586 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
587
588 // c = c + bias[broadcasted]
589 T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
590#endif // HAS_BIAS
591
592 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
593 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
594
595 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
596
597 TILE(uint, 16, 1, dst_indirect_y);
598
599 // Calculate the destination indirect Y
600 LOOP_UNROLLING(int, yk, 0, 1, 4,
601 {
602 LOOP_UNROLLING(int, xk, 0, 1, 4,
603 {
604 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
605 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
606 dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
607 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
608 })
609 })
610
611 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
612 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
613#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
614}
ramelg01bb6877a2022-02-08 09:38:17 +0000615#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100616#endif // defined(VEC_SIZE) && VEC_SIZE == 4
617
618#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
619#if defined(VEC_SIZE) && VEC_SIZE == 2
ramelg01bb6877a2022-02-08 09:38:17 +0000620#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100621/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
622 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100623 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
624 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
625 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
626 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
627 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
628 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
629 *
630 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
631 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
632 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
633 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
634 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
635 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
636 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
637 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
638 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
639 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
640 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
641 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
642 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
643 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
644 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
645 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
646 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
647 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
648 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
649 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000650 * @param[in] SRC_HEIGHT The source tensor's height
651 * @param[in] DST_WIDTH The destination tensor's width
652 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100653 */
654__kernel void winograd_output_transform_2x1_7x1_nhwc(
655 TENSOR4D_DECLARATION(src),
656 TENSOR4D_DECLARATION(dst),
657#if defined(HAS_BIAS)
658 VECTOR_DECLARATION(bias),
659#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000660 int dst_size,
661 const int SRC_HEIGHT,
662 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100663 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100664{
665 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
666 src_stride_x,
667 src_step_x,
668 src_stride_y,
669 src_step_y,
670 src_stride_z,
671 src_step_z,
672 src_stride_w,
673 src_step_w,
674 src_offset_first_element_in_bytes,
675 dst_ptr,
676 dst_stride_x,
677 dst_step_x,
678 dst_stride_y,
679 dst_step_y,
680 dst_stride_z,
681 dst_step_z,
682 dst_stride_w,
683 dst_step_w,
684 dst_offset_first_element_in_bytes,
685#if defined(HAS_BIAS)
686 bias_ptr,
687 bias_stride_x,
688 bias_step_x,
689 bias_offset_first_element_in_bytes,
690#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000691 dst_size,
692 SRC_HEIGHT,
693 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100694 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100695}
ramelg01bb6877a2022-02-08 09:38:17 +0000696#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100697#endif // defined(VEC_SIZE) && VEC_SIZE == 2
698
699#if defined(VEC_SIZE) && VEC_SIZE == 4
ramelg01bb6877a2022-02-08 09:38:17 +0000700#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100701/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
702 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100703 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
704 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
705 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
706 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
707 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
708 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
709 *
710 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
711 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
712 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
713 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
714 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
715 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
716 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
717 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
718 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
719 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
720 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
721 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
722 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
723 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
724 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
725 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
726 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
727 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
728 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
729 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000730 * @param[in] SRC_HEIGHT The source tensor's height
731 * @param[in] DST_WIDTH The destination tensor's width
732 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100733 */
734__kernel void winograd_output_transform_4x1_3x1_nhwc(
735 TENSOR4D_DECLARATION(src),
736 TENSOR4D_DECLARATION(dst),
737#if defined(HAS_BIAS)
738 VECTOR_DECLARATION(bias),
739#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000740 int dst_size,
741 const int SRC_HEIGHT,
742 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100743 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100744{
745 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
746 src_stride_x,
747 src_step_x,
748 src_stride_y,
749 src_step_y,
750 src_stride_z,
751 src_step_z,
752 src_stride_w,
753 src_step_w,
754 src_offset_first_element_in_bytes,
755 dst_ptr,
756 dst_stride_x,
757 dst_step_x,
758 dst_stride_y,
759 dst_step_y,
760 dst_stride_z,
761 dst_step_z,
762 dst_stride_w,
763 dst_step_w,
764 dst_offset_first_element_in_bytes,
765#if defined(HAS_BIAS)
766 bias_ptr,
767 bias_stride_x,
768 bias_step_x,
769 bias_offset_first_element_in_bytes,
770#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000771 dst_size,
772 SRC_HEIGHT,
773 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100774 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100775}
ramelg01bb6877a2022-02-08 09:38:17 +0000776#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100777
ramelg01bb6877a2022-02-08 09:38:17 +0000778#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100779/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
780 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100781 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
782 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
783 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
784 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
785 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
786 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
787 *
788 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
789 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
790 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
791 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
792 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
793 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
794 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
795 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
796 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
797 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
798 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
799 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
800 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
801 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
802 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
803 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
804 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
805 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
806 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
807 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000808 * @param[in] SRC_HEIGHT The source tensor's height
809 * @param[in] DST_WIDTH The destination tensor's width
810 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100811 */
812__kernel void winograd_output_transform_4x1_5x1_nhwc(
813 TENSOR4D_DECLARATION(src),
814 TENSOR4D_DECLARATION(dst),
815#if defined(HAS_BIAS)
816 VECTOR_DECLARATION(bias),
817#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000818 int dst_size,
819 const int SRC_HEIGHT,
820 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100821 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100822{
823 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
824 src_stride_x,
825 src_step_x,
826 src_stride_y,
827 src_step_y,
828 src_stride_z,
829 src_step_z,
830 src_stride_w,
831 src_step_w,
832 src_offset_first_element_in_bytes,
833 dst_ptr,
834 dst_stride_x,
835 dst_step_x,
836 dst_stride_y,
837 dst_step_y,
838 dst_stride_z,
839 dst_step_z,
840 dst_stride_w,
841 dst_step_w,
842 dst_offset_first_element_in_bytes,
843#if defined(HAS_BIAS)
844 bias_ptr,
845 bias_stride_x,
846 bias_step_x,
847 bias_offset_first_element_in_bytes,
848#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000849 dst_size,
850 SRC_HEIGHT,
851 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100852 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100853}
ramelg01bb6877a2022-02-08 09:38:17 +0000854#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100855#endif // defined(VEC_SIZE) && VEC_SIZE == 4
856#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
857
858#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
859#if defined(VEC_SIZE) && VEC_SIZE == 2
ramelg01bb6877a2022-02-08 09:38:17 +0000860#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100861/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
862 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100863 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
864 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
865 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
866 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
867 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
868 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
869 *
870 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
871 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
872 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
873 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
874 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
875 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
876 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
877 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
878 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
879 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
880 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
881 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
882 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
883 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
884 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
885 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
886 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
887 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
888 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
889 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000890 * @param[in] SRC_HEIGHT The source tensor's height
891 * @param[in] DST_WIDTH The destination tensor's width
892 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100893 */
894__kernel void winograd_output_transform_1x2_1x7_nhwc(
895 TENSOR4D_DECLARATION(src),
896 TENSOR4D_DECLARATION(dst),
897#if defined(HAS_BIAS)
898 VECTOR_DECLARATION(bias),
899#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000900 int dst_size,
901 const int SRC_HEIGHT,
902 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100903 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100904{
905 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
906 src_stride_x,
907 src_step_x,
908 src_stride_y,
909 src_step_y,
910 src_stride_z,
911 src_step_z,
912 src_stride_w,
913 src_step_w,
914 src_offset_first_element_in_bytes,
915 dst_ptr,
916 dst_stride_x,
917 dst_step_x,
918 dst_stride_y,
919 dst_step_y,
920 dst_stride_z,
921 dst_step_z,
922 dst_stride_w,
923 dst_step_w,
924 dst_offset_first_element_in_bytes,
925#if defined(HAS_BIAS)
926 bias_ptr,
927 bias_stride_x,
928 bias_step_x,
929 bias_offset_first_element_in_bytes,
930#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000931 dst_size,
932 SRC_HEIGHT,
933 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100934 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100935}
ramelg01bb6877a2022-02-08 09:38:17 +0000936#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100937#endif // defined(VEC_SIZE) && VEC_SIZE == 2
938
939#if defined(VEC_SIZE) && VEC_SIZE == 4
ramelg01bb6877a2022-02-08 09:38:17 +0000940#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100941/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
942 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100943 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
944 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
945 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
946 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
947 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
948 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
949 *
950 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
951 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
952 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
953 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
954 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
955 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
956 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
957 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
958 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
959 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
960 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
961 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
962 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
963 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
964 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
965 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
966 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
967 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
968 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
969 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +0000970 * @param[in] SRC_HEIGHT The source tensor's height
971 * @param[in] DST_WIDTH The destination tensor's width
972 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100973 */
974__kernel void winograd_output_transform_1x4_1x3_nhwc(
975 TENSOR4D_DECLARATION(src),
976 TENSOR4D_DECLARATION(dst),
977#if defined(HAS_BIAS)
978 VECTOR_DECLARATION(bias),
979#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +0000980 int dst_size,
981 const int SRC_HEIGHT,
982 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +0100983 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100984{
985 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
986 src_stride_x,
987 src_step_x,
988 src_stride_y,
989 src_step_y,
990 src_stride_z,
991 src_step_z,
992 src_stride_w,
993 src_step_w,
994 src_offset_first_element_in_bytes,
995 dst_ptr,
996 dst_stride_x,
997 dst_step_x,
998 dst_stride_y,
999 dst_step_y,
1000 dst_stride_z,
1001 dst_step_z,
1002 dst_stride_w,
1003 dst_step_w,
1004 dst_offset_first_element_in_bytes,
1005#if defined(HAS_BIAS)
1006 bias_ptr,
1007 bias_stride_x,
1008 bias_step_x,
1009 bias_offset_first_element_in_bytes,
1010#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +00001011 dst_size,
1012 SRC_HEIGHT,
1013 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +01001014 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001015}
ramelg01bb6877a2022-02-08 09:38:17 +00001016#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001017
ramelg01bb6877a2022-02-08 09:38:17 +00001018#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001019/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
1020 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001021 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1022 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1023 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1024 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
1025 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1026 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1027 *
1028 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
1029 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1030 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1031 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1032 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1033 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1034 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1035 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1036 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1037 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1038 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1039 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1040 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1041 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1042 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1043 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1044 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1045 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
1046 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1047 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg01bb6877a2022-02-08 09:38:17 +00001048 * @param[in] SRC_HEIGHT The source tensor's height
1049 * @param[in] DST_WIDTH The destination tensor's width
1050 * @param[in] DST_HEIGHT The destination tensor's height
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001051 */
1052__kernel void winograd_output_transform_1x4_1x5_nhwc(
1053 TENSOR4D_DECLARATION(src),
1054 TENSOR4D_DECLARATION(dst),
1055#if defined(HAS_BIAS)
1056 VECTOR_DECLARATION(bias),
1057#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +00001058 int dst_size,
1059 const int SRC_HEIGHT,
1060 const int DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +01001061 const int DST_HEIGHT)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001062{
1063 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1064 src_stride_x,
1065 src_step_x,
1066 src_stride_y,
1067 src_step_y,
1068 src_stride_z,
1069 src_step_z,
1070 src_stride_w,
1071 src_step_w,
1072 src_offset_first_element_in_bytes,
1073 dst_ptr,
1074 dst_stride_x,
1075 dst_step_x,
1076 dst_stride_y,
1077 dst_step_y,
1078 dst_stride_z,
1079 dst_step_z,
1080 dst_stride_w,
1081 dst_step_w,
1082 dst_offset_first_element_in_bytes,
1083#if defined(HAS_BIAS)
1084 bias_ptr,
1085 bias_stride_x,
1086 bias_step_x,
1087 bias_offset_first_element_in_bytes,
1088#endif // defined(HAS_BIAS)
ramelg01bb6877a2022-02-08 09:38:17 +00001089 dst_size,
1090 SRC_HEIGHT,
1091 DST_WIDTH,
Gian Marco Iodice0c687042022-06-14 15:13:16 +01001092 DST_HEIGHT);
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001093}
ramelg01bb6877a2022-02-08 09:38:17 +00001094#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001095#endif // defined(VEC_SIZE) && VEC_SIZE == 4
1096#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1097#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)