blob: 0fcd04e7133c8a261446be9c1839c9c84cdcd7e2 [file] [log] [blame]
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001/*
2 * Copyright (c) 2018-2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "activation_float_helpers.h"
25#include "helpers.h"
26#include "tile_helpers.h"
27
28#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
29#if defined(VEC_SIZE) && VEC_SIZE == 2
30/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
31 *
32 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
33 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
34 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
35 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
36 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
37 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
38 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
39 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
40 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
41 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
42 *
43 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
44 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
45 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
46 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
47 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
48 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
49 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
50 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
51 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
52 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
53 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
54 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
55 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
56 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
57 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
58 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
59 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
60 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
61 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
62 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
63 */
64__kernel void winograd_output_transform_2x2_7x7_nhwc(
65 TENSOR4D(src, BUFFER),
66 TENSOR4D(dst, BUFFER),
67#if defined(HAS_BIAS)
68 VECTOR_DECLARATION(bias),
69#endif // defined(HAS_BIAS)
70 int dst_size)
71{
72#define _ISRC_HEIGHT SRC_HEIGHT
73#define _IDST_WIDTH DST_WIDTH
74#define _IDST_HEIGHT DST_HEIGHT
75#define _INUM_TILES_X NUM_TILES_X
76
77 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
78 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
79 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
80
81 int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
82 int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
83
84#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
85 TILE(DATA_TYPE, 8, N0, in);
86 TILE(DATA_TYPE, 2, N0, out);
87 TILE(uint, 8, 1, src_indirect_y);
88
89 // Calculate the indirect Y for the source tensor
90 LOOP_UNROLLING(int, i, 0, 1, 8,
91 {
92 src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
93 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
94 })
95
96 // Initialize the input tile
97 LOOP_UNROLLING(int, i, 0, 1, 8,
98 {
99 in[i].v = 0;
100 })
101
102 // Load the values across the 8 channels to compose the 8x1 tile
103 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
104
105 // Compute out0 and out01
106 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
107 out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
108
109#if defined(HAS_BIAS)
110 // Add bias
111 TILE(DATA_TYPE, 1, N0, b);
112
113 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
114
115 T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
116#endif // defined(HAS_BIAS)
117
118 T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
119
120 TILE(uint, 2, 1, dst_indirect_y);
121
122#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
123 LOOP_UNROLLING(int, yk, 0, 1, 2,
124 {
125 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
126 dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
127 })
128#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
129 LOOP_UNROLLING(int, xk, 0, 1, 2,
130 {
131 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
132 dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
133 })
134#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
135
136 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
137 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
138
139#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
140
141 TILE(DATA_TYPE, 64, N0, in);
142 TILE(DATA_TYPE, 4, N0, out);
143 TILE(DATA_TYPE, 16, N0, tmp);
144 TILE(uint, 64, 1, src_indirect_y);
145
146 // Calculate the indirect Y for the source tensor
147 LOOP_UNROLLING(int, i, 0, 1, 64,
148 {
149 src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
150 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
151 })
152
153 // Initialize the input tile
154 LOOP_UNROLLING(int, i, 0, 1, 64,
155 {
156 in[i].v = 0;
157 })
158
159 // Load the values across the 64 channels to compose the 8x8 tile
160 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
161
162 LOOP_UNROLLING(int, i, 0, 1, 8,
163 {
164 tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
165 tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
166 })
167
168 // Compute the 2x2 output tile
169 LOOP_UNROLLING(int, i, 0, 1, 2,
170 {
171 out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
172 out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
173 })
174
175#if defined(HAS_BIAS)
176 // Add bias
177 TILE(DATA_TYPE, 1, N0, b);
178
179 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
180
181 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
182#endif // defined(HAS_BIAS)
183
184 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
185
186 TILE(uint, 4, 1, dst_indirect_y);
187
188 // Calculate the destination indirect Y
189 LOOP_UNROLLING(int, yk, 0, 1, 2,
190 {
191 LOOP_UNROLLING(int, xk, 0, 1, 2,
192 {
193 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
194 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
195 dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;
196 dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
197 })
198 })
199
200 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
201 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
202#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
203}
204#endif // defined(VEC_SIZE) && VEC_SIZE == 2
205
206#if defined(VEC_SIZE) && VEC_SIZE == 4
207/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
208 *
209 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
210 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
211 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
212 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
213 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
214 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
215 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
216 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
217 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
218 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
219 *
220 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
221 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
222 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
223 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
224 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
225 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
226 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
227 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
228 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
229 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
230 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
231 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
232 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
233 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
234 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
235 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
236 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
237 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
238 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
239 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
240 * @param[in] dst_size Size of the destination tensor, minus the last padding
241 */
242__kernel void winograd_output_transform_4x4_3x3_nhwc(
243 TENSOR4D(src, BUFFER),
244 TENSOR4D(dst, BUFFER),
245#if defined(HAS_BIAS)
246 VECTOR_DECLARATION(bias),
247#endif // defined(HAS_BIAS)
248 int dst_size)
249{
250 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
251 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
252 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
253
254#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
255
256 TILE(DATA_TYPE, 6, N0, in);
257 TILE(DATA_TYPE, 4, N0, out);
258 TILE(uint, 6, 1, src_indirect_y);
259
260 LOOP_UNROLLING(int, i, 0, 1, 6,
261 {
262 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
263 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
264 })
265
266 // Initialize the input tile
267 LOOP_UNROLLING(int, i, 0, 1, 6,
268 {
269 in[i].v = 0;
270 })
271
272 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
273 T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
274
275 // Compute out00, out01, out02 and out03
276 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
277 out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
278 out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
279 out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
280
281#if defined(HAS_BIAS)
282 TILE(DATA_TYPE, 1, N0, b);
283
284 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
285
286 // c = c + bias[broadcasted]
287 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
288#endif // HAS_BIAS
289
290 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
291 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
292
293 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
294
295 TILE(uint, 4, 1, dst_indirect_y);
296
297 // Calculate the destination indirect Y
298#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
299 LOOP_UNROLLING(int, yk, 0, 1, 4,
300 {
301 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
302 dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
303 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
304 })
305#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
306 LOOP_UNROLLING(int, xk, 0, 1, 4,
307 {
308 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
309 dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
310 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
311 })
312#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
313
314 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
315 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
316
317#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
318
319 // Calculate the indirect Y for the source tensor
320 TILE(DATA_TYPE, 36, N0, in);
321 TILE(DATA_TYPE, 4, N0, tmp);
322 TILE(uint, 36, 1, src_indirect_y);
323
324 LOOP_UNROLLING(int, i, 0, 1, 36,
325 {
326 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
327 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
328 })
329
330 // Initialize the input tile
331 LOOP_UNROLLING(int, i, 0, 1, 36,
332 {
333 in[i].v = 0;
334 })
335
336 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
337 T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
338
339 LOOP_UNROLLING(int, i, 0, 1, 6,
340 {
341 tmp[0].v = in[6 + i].v + in[12 + i].v;
342 tmp[1].v = in[6 + i].v - in[12 + i].v;
343 tmp[2].v = in[18 + i].v + in[24 + i].v;
344 tmp[3].v = in[18 + i].v - in[24 + i].v;
345 tmp[3].v = tmp[3].v + tmp[3].v;
346 in[i].v = in[i].v + tmp[0].v + tmp[2].v;
347 in[6 + i].v = tmp[3].v + tmp[1].v;
348 in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
349 in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
350 })
351
352 // Compute the output tile
353 TILE(DATA_TYPE, 16, N0, out);
354
355 LOOP_UNROLLING(int, i, 0, 1, 4,
356 {
357 tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
358 tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
359 tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
360 tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
361 tmp[3].v = tmp[3].v + tmp[3].v;
362 out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
363 out[4 * i + 1].v = tmp[3].v + tmp[1].v;
364 out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
365 out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
366 })
367
368#if defined(HAS_BIAS)
369 TILE(DATA_TYPE, 1, N0, b);
370
371 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
372
373 // c = c + bias[broadcasted]
374 T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
375#endif // HAS_BIAS
376
377 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
378 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
379
380 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
381
382 TILE(uint, 16, 1, dst_indirect_y);
383
384 // Calculate the destination indirect Y
385 LOOP_UNROLLING(int, yk, 0, 1, 4,
386 {
387 LOOP_UNROLLING(int, xk, 0, 1, 4,
388 {
389 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
390 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
391 dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
392 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
393 })
394 })
395
396 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
397 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
398#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
399}
400
401/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
402 *
403 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
404 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
405 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
406 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
407 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
408 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
409 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
410 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
411 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
412 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
413 *
414 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
415 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
416 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
417 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
418 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
419 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
420 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
421 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
422 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
423 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
424 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
425 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
426 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
427 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
428 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
429 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
430 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
431 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
432 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
433 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
434 */
435__kernel void winograd_output_transform_4x4_5x5_nhwc(
436 TENSOR4D(src, BUFFER),
437 TENSOR4D(dst, BUFFER),
438#if defined(HAS_BIAS)
439 VECTOR_DECLARATION(bias),
440#endif // defined(HAS_BIAS)
441 int dst_size)
442{
443 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
444 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
445 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
446
447#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
448 TILE(DATA_TYPE, 8, N0, in);
449 TILE(DATA_TYPE, 4, N0, out);
450 TILE(DATA_TYPE, 4, N0, tmp);
451 TILE(uint, 8, 1, src_indirect_y);
452
453 LOOP_UNROLLING(int, i, 0, 1, 8,
454 {
455 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
456 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
457 })
458
459 // Initialize the input tile
460 LOOP_UNROLLING(int, i, 0, 1, 8,
461 {
462 in[i].v = 0;
463 })
464
465 // "in" contains 1x8 or 8x1 tile here
466 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
467
468 // A^T * in, and in this degenerate case out consists of 1 column/row
469 tmp[0].v = in[1].v - in[2].v;
470 tmp[1].v = 2.0f * (in[3].v - in[4].v);
471 tmp[2].v = 2.0f * (in[5].v + in[6].v);
472 tmp[3].v = in[3].v + in[4].v;
473 out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
474 out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
475 out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
476 out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
477
478#if defined(HAS_BIAS)
479 TILE(DATA_TYPE, 1, N0, b);
480
481 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
482
483 // c = c + bias[broadcasted]
484 T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
485#endif // HAS_BIAS
486
487 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
488 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
489
490 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
491
492 TILE(uint, 4, 1, dst_indirect_y);
493
494 // Calculate the destination indirect Y
495#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
496 LOOP_UNROLLING(int, yk, 0, 1, 4,
497 {
498 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
499 dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
500 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
501 })
502#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
503 LOOP_UNROLLING(int, xk, 0, 1, 4,
504 {
505 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
506 dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
507 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
508 })
509#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
510
511 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
512 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
513
514#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
515 // Calculate the indirect Y for the source tensor
516 TILE(DATA_TYPE, 64, N0, in);
517 TILE(DATA_TYPE, 6, N0, tmp);
518 TILE(uint, 64, 1, src_indirect_y);
519
520 LOOP_UNROLLING(int, i, 0, 1, 64,
521 {
522 src_indirect_y[i].v = mout + i *SRC_HEIGHT;
523 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
524 })
525
526 // Initialize the input tile
527 LOOP_UNROLLING(int, i, 0, 1, 64,
528 {
529 in[i].v = 0;
530 })
531
532 // "in" here is 8x8 tile
533 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
534
535 // A^T * in
536 LOOP_UNROLLING(int, i, 0, 1, 8,
537 {
538 tmp[0].v = in[8 + i].v + in[16 + i].v;
539 tmp[1].v = in[8 + i].v - in[16 + i].v;
540 tmp[2].v = in[24 + i].v + in[32 + i].v;
541 tmp[3].v = in[24 + i].v - in[32 + i].v;
542 tmp[3].v = tmp[3].v + tmp[3].v;
543 tmp[4].v = in[40 + i].v + in[48 + i].v;
544 tmp[4].v = tmp[4].v + tmp[4].v;
545 tmp[5].v = in[40 + i].v - in[48 + i].v;
546
547 // 4x8 matrix as a result
548 in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
549 in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
550 in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
551 in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
552 })
553
554 // Compute the output tile
555 TILE(DATA_TYPE, 16, N0, out);
556
557 // in * A, with in = A^T * in as above
558 LOOP_UNROLLING(int, i, 0, 1, 4,
559 {
560 tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
561 tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
562 tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
563 tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
564 tmp[3].v = tmp[3].v + tmp[3].v;
565 tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
566 tmp[4].v = tmp[4].v + tmp[4].v;
567 tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
568
569 // 4x4 tile
570 out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
571 out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
572 out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
573 out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
574 })
575
576#if defined(HAS_BIAS)
577 TILE(DATA_TYPE, 1, N0, b);
578
579 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
580
581 // c = c + bias[broadcasted]
582 T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
583#endif // HAS_BIAS
584
585 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
586 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
587
588 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
589
590 TILE(uint, 16, 1, dst_indirect_y);
591
592 // Calculate the destination indirect Y
593 LOOP_UNROLLING(int, yk, 0, 1, 4,
594 {
595 LOOP_UNROLLING(int, xk, 0, 1, 4,
596 {
597 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
598 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
599 dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
600 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
601 })
602 })
603
604 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
605 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
606#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
607}
608#endif // defined(VEC_SIZE) && VEC_SIZE == 4
609
610#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
611#if defined(VEC_SIZE) && VEC_SIZE == 2
612/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
613 *
614 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
615 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
616 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
617 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
618 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
619 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
620 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
621 *
622 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
623 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
624 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
625 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
626 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
627 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
628 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
629 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
630 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
631 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
632 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
633 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
634 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
635 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
636 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
637 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
638 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
639 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
640 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
641 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
642 */
643__kernel void winograd_output_transform_2x1_7x1_nhwc(
644 TENSOR4D_DECLARATION(src),
645 TENSOR4D_DECLARATION(dst),
646#if defined(HAS_BIAS)
647 VECTOR_DECLARATION(bias),
648#endif // defined(HAS_BIAS)
649 int dst_size)
650{
651 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
652 src_stride_x,
653 src_step_x,
654 src_stride_y,
655 src_step_y,
656 src_stride_z,
657 src_step_z,
658 src_stride_w,
659 src_step_w,
660 src_offset_first_element_in_bytes,
661 dst_ptr,
662 dst_stride_x,
663 dst_step_x,
664 dst_stride_y,
665 dst_step_y,
666 dst_stride_z,
667 dst_step_z,
668 dst_stride_w,
669 dst_step_w,
670 dst_offset_first_element_in_bytes,
671#if defined(HAS_BIAS)
672 bias_ptr,
673 bias_stride_x,
674 bias_step_x,
675 bias_offset_first_element_in_bytes,
676#endif // defined(HAS_BIAS)
677 dst_size);
678}
679#endif // defined(VEC_SIZE) && VEC_SIZE == 2
680
681#if defined(VEC_SIZE) && VEC_SIZE == 4
682
683/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
684 *
685 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
686 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
687 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
688 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
689 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
690 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
691 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
692 *
693 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
694 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
695 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
696 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
697 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
698 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
699 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
700 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
701 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
702 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
703 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
704 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
705 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
706 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
707 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
708 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
709 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
710 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
711 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
712 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
713 */
714__kernel void winograd_output_transform_4x1_3x1_nhwc(
715 TENSOR4D_DECLARATION(src),
716 TENSOR4D_DECLARATION(dst),
717#if defined(HAS_BIAS)
718 VECTOR_DECLARATION(bias),
719#endif // defined(HAS_BIAS)
720 int dst_size)
721{
722 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
723 src_stride_x,
724 src_step_x,
725 src_stride_y,
726 src_step_y,
727 src_stride_z,
728 src_step_z,
729 src_stride_w,
730 src_step_w,
731 src_offset_first_element_in_bytes,
732 dst_ptr,
733 dst_stride_x,
734 dst_step_x,
735 dst_stride_y,
736 dst_step_y,
737 dst_stride_z,
738 dst_step_z,
739 dst_stride_w,
740 dst_step_w,
741 dst_offset_first_element_in_bytes,
742#if defined(HAS_BIAS)
743 bias_ptr,
744 bias_stride_x,
745 bias_step_x,
746 bias_offset_first_element_in_bytes,
747#endif // defined(HAS_BIAS)
748 dst_size);
749}
750
751/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
752 *
753 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
754 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
755 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
756 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
757 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
758 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
759 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
760 *
761 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
762 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
763 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
764 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
765 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
766 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
767 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
768 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
769 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
770 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
771 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
772 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
773 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
774 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
775 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
776 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
777 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
778 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
779 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
780 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
781 */
782__kernel void winograd_output_transform_4x1_5x1_nhwc(
783 TENSOR4D_DECLARATION(src),
784 TENSOR4D_DECLARATION(dst),
785#if defined(HAS_BIAS)
786 VECTOR_DECLARATION(bias),
787#endif // defined(HAS_BIAS)
788 int dst_size)
789{
790 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
791 src_stride_x,
792 src_step_x,
793 src_stride_y,
794 src_step_y,
795 src_stride_z,
796 src_step_z,
797 src_stride_w,
798 src_step_w,
799 src_offset_first_element_in_bytes,
800 dst_ptr,
801 dst_stride_x,
802 dst_step_x,
803 dst_stride_y,
804 dst_step_y,
805 dst_stride_z,
806 dst_step_z,
807 dst_stride_w,
808 dst_step_w,
809 dst_offset_first_element_in_bytes,
810#if defined(HAS_BIAS)
811 bias_ptr,
812 bias_stride_x,
813 bias_step_x,
814 bias_offset_first_element_in_bytes,
815#endif // defined(HAS_BIAS)
816 dst_size);
817}
818#endif // defined(VEC_SIZE) && VEC_SIZE == 4
819#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
820
821#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
822#if defined(VEC_SIZE) && VEC_SIZE == 2
823/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
824 *
825 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
826 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
827 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
828 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
829 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
830 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
831 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
832 *
833 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
834 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
835 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
836 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
837 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
838 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
839 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
840 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
841 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
842 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
843 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
844 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
845 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
846 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
847 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
848 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
849 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
850 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
851 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
852 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
853 */
854__kernel void winograd_output_transform_1x2_1x7_nhwc(
855 TENSOR4D_DECLARATION(src),
856 TENSOR4D_DECLARATION(dst),
857#if defined(HAS_BIAS)
858 VECTOR_DECLARATION(bias),
859#endif // defined(HAS_BIAS)
860 int dst_size)
861{
862 winograd_output_transform_2x2_7x7_nhwc(src_ptr,
863 src_stride_x,
864 src_step_x,
865 src_stride_y,
866 src_step_y,
867 src_stride_z,
868 src_step_z,
869 src_stride_w,
870 src_step_w,
871 src_offset_first_element_in_bytes,
872 dst_ptr,
873 dst_stride_x,
874 dst_step_x,
875 dst_stride_y,
876 dst_step_y,
877 dst_stride_z,
878 dst_step_z,
879 dst_stride_w,
880 dst_step_w,
881 dst_offset_first_element_in_bytes,
882#if defined(HAS_BIAS)
883 bias_ptr,
884 bias_stride_x,
885 bias_step_x,
886 bias_offset_first_element_in_bytes,
887#endif // defined(HAS_BIAS)
888 dst_size);
889}
890#endif // defined(VEC_SIZE) && VEC_SIZE == 2
891
892#if defined(VEC_SIZE) && VEC_SIZE == 4
893/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
894 *
895 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
896 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
897 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
898 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
899 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
900 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
901 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
902 *
903 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
904 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
905 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
906 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
907 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
908 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
909 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
910 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
911 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
912 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
913 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
914 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
915 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
916 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
917 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
918 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
919 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
920 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
921 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
922 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
923 */
924__kernel void winograd_output_transform_1x4_1x3_nhwc(
925 TENSOR4D_DECLARATION(src),
926 TENSOR4D_DECLARATION(dst),
927#if defined(HAS_BIAS)
928 VECTOR_DECLARATION(bias),
929#endif // defined(HAS_BIAS)
930 int dst_size)
931{
932 winograd_output_transform_4x4_3x3_nhwc(src_ptr,
933 src_stride_x,
934 src_step_x,
935 src_stride_y,
936 src_step_y,
937 src_stride_z,
938 src_step_z,
939 src_stride_w,
940 src_step_w,
941 src_offset_first_element_in_bytes,
942 dst_ptr,
943 dst_stride_x,
944 dst_step_x,
945 dst_stride_y,
946 dst_step_y,
947 dst_stride_z,
948 dst_step_z,
949 dst_stride_w,
950 dst_step_w,
951 dst_offset_first_element_in_bytes,
952#if defined(HAS_BIAS)
953 bias_ptr,
954 bias_stride_x,
955 bias_step_x,
956 bias_offset_first_element_in_bytes,
957#endif // defined(HAS_BIAS)
958 dst_size);
959}
960
961/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
962 *
963 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
964 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
965 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
966 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
967 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
968 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
969 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
970 *
971 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
972 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
973 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
974 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
975 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
976 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
977 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
978 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
979 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
980 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
981 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
982 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
983 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
984 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
985 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
986 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
987 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
988 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
989 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
990 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
991 */
992__kernel void winograd_output_transform_1x4_1x5_nhwc(
993 TENSOR4D_DECLARATION(src),
994 TENSOR4D_DECLARATION(dst),
995#if defined(HAS_BIAS)
996 VECTOR_DECLARATION(bias),
997#endif // defined(HAS_BIAS)
998 int dst_size)
999{
1000 winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1001 src_stride_x,
1002 src_step_x,
1003 src_stride_y,
1004 src_step_y,
1005 src_stride_z,
1006 src_step_z,
1007 src_stride_w,
1008 src_step_w,
1009 src_offset_first_element_in_bytes,
1010 dst_ptr,
1011 dst_stride_x,
1012 dst_step_x,
1013 dst_stride_y,
1014 dst_step_y,
1015 dst_stride_z,
1016 dst_step_z,
1017 dst_stride_w,
1018 dst_step_w,
1019 dst_offset_first_element_in_bytes,
1020#if defined(HAS_BIAS)
1021 bias_ptr,
1022 bias_stride_x,
1023 bias_step_x,
1024 bias_offset_first_element_in_bytes,
1025#endif // defined(HAS_BIAS)
1026 dst_size);
1027}
1028#endif // defined(VEC_SIZE) && VEC_SIZE == 4
1029#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1030#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)