blob: 7341336b92092b15765379e9cc80c18ad6dd2f02 [file] [log] [blame]
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001/*
Gian Marco Iodice905a3c12023-04-14 12:20:58 +01002 * Copyright (c) 2018-2023 Arm Limited.
Adnan AlSinan7075fe22021-07-05 13:12:52 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25#include "tile_helpers.h"
26
Gian Marco Iodice905a3c12023-04-14 12:20:58 +010027#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
28 ({ \
29 comm_fact.s0 = tmp.s2 - (DATA_TYPE)4.25f * tmp.s4 + tmp.s6; \
30 comm_fact.s1 = tmp.s1 - (DATA_TYPE)4.25f * tmp.s3 + tmp.s5; \
31 comm_fact.s2 = (DATA_TYPE)2.5f * tmp.s3; \
32 comm_fact.s3 = (DATA_TYPE)0.5f * tmp.s1 + (DATA_TYPE)2.f * tmp.s5 - comm_fact.s2; \
33 comm_fact.s4 = (DATA_TYPE)0.25f * tmp.s2 - (DATA_TYPE)1.25f * tmp.s4 + tmp.s6; \
34 comm_fact.s5 = (DATA_TYPE)4.f * tmp.s2 + tmp.s6 - (DATA_TYPE)5.f * tmp.s4; \
35 comm_fact.s6 = (DATA_TYPE)2.f * tmp.s1 + (DATA_TYPE)0.5f * tmp.s5 - comm_fact.s2; \
Adnan AlSinan7075fe22021-07-05 13:12:52 +010036 \
Gian Marco Iodice905a3c12023-04-14 12:20:58 +010037 out.s0 = tmp.s0 - tmp.s6 + (DATA_TYPE)5.25f * tmp.s4 - (DATA_TYPE)5.25f * tmp.s2; \
38 out.s1 = comm_fact.s0 + comm_fact.s1; \
39 out.s2 = comm_fact.s0 - comm_fact.s1; \
40 out.s3 = comm_fact.s3 + comm_fact.s4; \
41 out.s4 = comm_fact.s4 - comm_fact.s3; \
42 out.s5 = comm_fact.s5 + comm_fact.s6; \
43 out.s6 = comm_fact.s5 - comm_fact.s6; \
44 out.s7 = tmp.s7 - tmp.s1 + (DATA_TYPE)5.25f * tmp.s3 - (DATA_TYPE)5.25f * tmp.s5; \
Adnan AlSinan7075fe22021-07-05 13:12:52 +010045 })
46
Gian Marco Iodice905a3c12023-04-14 12:20:58 +010047#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
48 ({ \
49 comm_fact.s0 = (DATA_TYPE)36.0f * tmp.s2 - (DATA_TYPE)13.0f * tmp.s4 + tmp.s6; \
50 comm_fact.s1 = (DATA_TYPE)36.0f * tmp.s1 - (DATA_TYPE)13.0f * tmp.s3 + (DATA_TYPE)1.0f * tmp.s5; \
51 comm_fact.s2 = (DATA_TYPE)9.0f * tmp.s2 - (DATA_TYPE)10.0f * tmp.s4 + tmp.s6; \
52 comm_fact.s3 = (DATA_TYPE)18.0f * tmp.s1 - (DATA_TYPE)20.0f * tmp.s3 + (DATA_TYPE)2.0f * tmp.s5; \
53 comm_fact.s4 = (DATA_TYPE)4.0f * tmp.s2 - (DATA_TYPE)5.0f * tmp.s4 + tmp.s6; \
54 comm_fact.s5 = (DATA_TYPE)12.0f * tmp.s1 - (DATA_TYPE)15.0f * tmp.s3 + (DATA_TYPE)3.0f * tmp.s5; \
55 out.s0 = -(DATA_TYPE)36.0f * tmp.s0 + (DATA_TYPE)49.0f * tmp.s2 + -(DATA_TYPE)14.0f * tmp.s4 + tmp.s6; \
56 out.s1 = comm_fact.s0 - comm_fact.s1; \
57 out.s2 = comm_fact.s0 + comm_fact.s1; \
58 out.s3 = comm_fact.s2 - comm_fact.s3; \
59 out.s4 = comm_fact.s2 + comm_fact.s3; \
60 out.s5 = comm_fact.s4 - comm_fact.s5; \
61 out.s6 = comm_fact.s4 + comm_fact.s5; \
62 out.s7 = -(DATA_TYPE)36.0f * tmp.s1 + (DATA_TYPE)0.0f * tmp.s2 + (DATA_TYPE)49.0f * tmp.s3 - (DATA_TYPE)14.0f * tmp.s5 + tmp.s7; \
Adnan AlSinan7075fe22021-07-05 13:12:52 +010063 })
64
ramelg012a86a302022-02-04 20:49:14 +000065#if defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010066
ramelg012a86a302022-02-04 20:49:14 +000067#if defined(NHWC)
68#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010069//! @cond Doxygen_Suppress
70/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
71 *
72 * @note Data layout supported: NHWC
73 * @note Data type supported: F32/F16
74 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
Adnan AlSinan7075fe22021-07-05 13:12:52 +010075 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
76 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
77 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
78 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
79 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
80 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
81 *
82 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
83 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
84 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
85 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
86 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
87 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
88 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
89 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
90 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
91 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
92 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
93 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
94 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
95 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
96 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
97 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
98 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
99 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
100 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
101 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +0000102 * @param[in] _ISRC_WIDTH The src tensor's width
103 * @param[in] _ISRC_HEIGHT The src tensor's height
104 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
105 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100106 */
107//! @endcond
108__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
109 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +0000110 TENSOR4D(dst, BUFFER),
111 const int _ISRC_WIDTH,
112 const int _ISRC_HEIGHT,
113 const int _INUM_TILES_X,
114 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100115{
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100116 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
117 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
118#if defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100119 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100120#else // defined(IS_BATCHED)
121 const int bout = 0; // BATCH SIZE IDX
122#endif // defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100123
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100124 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
125 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
126 x -= PAD_LEFT;
127 y -= PAD_TOP;
128
129#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
130
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100131 TILE(DATA_TYPE, 6, N0, in);
132 TILE(DATA_TYPE, 6, N0, out);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100133
134 // Initialize the input tile
135 LOOP_UNROLLING(int, i, 0, 1, 6,
136 {
137 in[i].v = 0;
138 })
139
140#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100141 T_LOAD_NHWC(DATA_TYPE, 1, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100142#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100143 T_LOAD_NHWC(DATA_TYPE, 6, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100144#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
145
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100146 TILE(DATA_TYPE, 6, N0, com);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100147
148 LOOP_UNROLLING(int, i, 0, 1, 6,
149 {
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100150 in[i].v *= (DATA_TYPE)4.0f;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100151 })
152
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100153 com[0].v = in[2].v - (DATA_TYPE)4.f * in[0].v;
154 com[1].v = in[3].v - (DATA_TYPE)4.f * in[1].v;
155 com[2].v = in[4].v - (DATA_TYPE)4.f * in[2].v;
156 com[3].v = in[5].v - (DATA_TYPE)4.f * in[3].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100157 com[4].v = in[3].v - in[1].v;
158 com[4].v = com[4].v + com[4].v;
159 com[5].v = in[4].v - in[2].v;
160
161 out[0].v = com[2].v - com[0].v;
162 out[1].v = com[2].v + com[1].v;
163 out[2].v = com[2].v - com[1].v;
164 out[3].v = com[5].v + com[4].v;
165 out[4].v = com[5].v - com[4].v;
166 out[5].v = com[3].v - com[1].v;
167
168 TILE(uint, 6, 1, dst_indirect_y);
169
170 LOOP_UNROLLING(int, i, 0, 1, 6,
171 {
172 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
173 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6;
174 })
175
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100176 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100177
178#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
179
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100180 TILE(DATA_TYPE, 36, N0, in);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100181
182 // Initialize the input tile
183 LOOP_UNROLLING(int, i, 0, 1, 36,
184 {
185 in[i].v = 0;
186 })
187
188 // Load the tile from a NHWC tensor
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100189 T_LOAD_NHWC(DATA_TYPE, 6, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100190
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100191 TILE(DATA_TYPE, 6, N0, com);
192 TILE(DATA_TYPE, 36, N0, tmp);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100193
194 LOOP_UNROLLING(int, i, 0, 1, 6,
195 {
196 com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
197 com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
198 com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
199 com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
200 com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
201 com[4].v = com[4].v + com[4].v;
202 com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
203 tmp[i + 0 * 6].v = com[2].v - com[0].v;
204 tmp[i + 1 * 6].v = com[2].v + com[1].v;
205 tmp[i + 2 * 6].v = com[2].v - com[1].v;
206 tmp[i + 3 * 6].v = com[5].v + com[4].v;
207 tmp[i + 4 * 6].v = com[5].v - com[4].v;
208 tmp[i + 5 * 6].v = com[3].v - com[1].v;
209 })
210
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100211 TILE(DATA_TYPE, 36, N0, out);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100212
213 LOOP_UNROLLING(int, i, 0, 1, 6,
214 {
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100215 com[0].v = tmp[i * 6 + 2].v - (DATA_TYPE)4.f *tmp[i * 6 + 0].v;
216 com[1].v = tmp[i * 6 + 3].v - (DATA_TYPE)4.f *tmp[i * 6 + 1].v;
217 com[2].v = tmp[i * 6 + 4].v - (DATA_TYPE)4.f *tmp[i * 6 + 2].v;
218 com[3].v = tmp[i * 6 + 5].v - (DATA_TYPE)4.f *tmp[i * 6 + 3].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100219 com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
220 com[4].v = com[4].v + com[4].v;
221 com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
222 out[i * 6 + 0].v = com[2].v - com[0].v;
223 out[i * 6 + 1].v = com[2].v + com[1].v;
224 out[i * 6 + 2].v = com[2].v - com[1].v;
225 out[i * 6 + 3].v = com[5].v + com[4].v;
226 out[i * 6 + 4].v = com[5].v - com[4].v;
227 out[i * 6 + 5].v = com[3].v - com[1].v;
228 })
229
230 // Compute destination address
231 TILE(uint, 36, 1, dst_indirect_y);
232
233 LOOP_UNROLLING(int, i, 0, 1, 36,
234 {
235 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
236 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36;
237 })
238
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100239 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100240#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
241}
ramelg012a86a302022-02-04 20:49:14 +0000242#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100243
ramelg012a86a302022-02-04 20:49:14 +0000244#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100245//! @cond Doxygen_Suppress
246/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
247 *
248 * @note Data layout supported: NHWC
249 * @note Data type supported: F32/F16
250 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100251 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
252 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
253 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
254 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
255 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
256 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
257 *
258 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
259 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
260 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
261 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
262 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
263 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
264 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
265 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
266 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
267 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
268 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
269 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
270 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
271 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
272 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
273 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
274 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
275 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
276 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
277 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +0000278 * @param[in] _ISRC_WIDTH The src tensor's width
279 * @param[in] _ISRC_HEIGHT The src tensor's height
280 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
281 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100282 */
283//! @endcond
284__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
285 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +0000286 TENSOR4D(dst, BUFFER),
287 const int _ISRC_WIDTH,
288 const int _ISRC_HEIGHT,
289 const int _INUM_TILES_X,
290 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100291{
292 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
293 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100294#if defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100295 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100296#else // defined(IS_BATCHED)
297 const int bout = 0; // BATCH SIZE IDX
298#endif // defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100299
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100300 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
301 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
302 x -= PAD_LEFT;
303 y -= PAD_TOP;
304
305#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
306
307 TILE(DATA_TYPE, 8, 1, in);
308 TILE(DATA_TYPE, 8, 1, out);
309
310 // Initialize the input tile
311 LOOP_UNROLLING(int, i, 0, 1, 8,
312 {
313 in[i].v = 0;
314 })
315
316#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100317 T_LOAD_NHWC(DATA_TYPE, 1, 8, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100318#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100319 T_LOAD_NHWC(DATA_TYPE, 8, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100320#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
321
322 TILE(DATA_TYPE, 1, 8, com);
323
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100324 com[0].s[0] = in[2].v - (DATA_TYPE)4.25f * in[4].v + in[6].v;
325 com[0].s[1] = in[1].v - (DATA_TYPE)4.25f * in[3].v + in[5].v;
326 com[0].s[2] = (DATA_TYPE)0.5f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)2.0f * in[5].v;
327 com[0].s[3] = (DATA_TYPE)0.25f * in[2].v - (DATA_TYPE)1.25f * in[4].v + in[6].v;
328 com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;
329 com[0].s[5] = (DATA_TYPE)2.0f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)0.5f * in[5].v;
330 out[0].s[0] = in[0].v - 5.25f * in[2].v + (DATA_TYPE)5.25f * in[4].v - in[6].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100331 out[1].s[0] = com[0].s[0] + com[0].s[1];
332 out[2].s[0] = com[0].s[0] - com[0].s[1];
333 out[3].s[0] = com[0].s[3] + com[0].s[2];
334 out[4].s[0] = com[0].s[3] - com[0].s[2];
335 out[5].s[0] = com[0].s[4] + com[0].s[5];
336 out[6].s[0] = com[0].s[4] - com[0].s[5];
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100337 out[7].s[0] = -in[1].v + (DATA_TYPE)5.25f * in[3].v - (DATA_TYPE)5.25f * in[5].v + in[7].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100338
339 TILE(uint, 8, 1, dst_indirect_y);
340
341 LOOP_UNROLLING(int, i, 0, 1, 8,
342 {
343 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
344 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
345 })
346
347 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
348
349#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
350
351 TILE(DATA_TYPE, 64, 1, in);
352 TILE(DATA_TYPE, 64, 1, out);
353
354 // Initialize the input tile
355 LOOP_UNROLLING(int, i, 0, 1, 64,
356 {
357 in[i].v = 0;
358 })
359
360 // Load the tile from a NHWC tensor
361 T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
362
363 TILE(DATA_TYPE, 8, 8, com);
364
365 LOOP_UNROLLING(int, i, 0, 1, 8,
366 {
367 com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
368 com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
369 com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
370 com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
371 com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
372 com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
373 com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
374 com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
375 })
376
377 TILE(DATA_TYPE, 8, 8, tmp);
378 tmp[0].v = com[6].v;
379 tmp[1].v = com[0].v + com[1].v;
380 tmp[2].v = com[0].v - com[1].v;
381 tmp[3].v = com[2].v + com[3].v;
382 tmp[4].v = com[2].v - com[3].v;
383 tmp[5].v = com[4].v + com[5].v;
384 tmp[6].v = com[4].v - com[5].v;
385 tmp[7].v = com[7].v;
386
387 LOOP_UNROLLING(int, i, 0, 1, 8,
388 {
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100389 com[0].s[0] = tmp[i].s[2] - (DATA_TYPE)4.25f * tmp[i].s[4] + tmp[i].s[6];
390 com[0].s[1] = tmp[i].s[1] - (DATA_TYPE)4.25f * tmp[i].s[3] + tmp[i].s[5];
391 com[0].s[2] = (DATA_TYPE)0.5f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];
392 com[0].s[3] = (DATA_TYPE)0.25f * tmp[i].s[2] - (DATA_TYPE)1.25f * tmp[i].s[4] + tmp[i].s[6];
393 com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];
394 com[0].s[5] = (DATA_TYPE)2.0f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)0.5f * tmp[i].s[5];
395 out[i * 8 + 0].s[0] = tmp[i].s[0] - (DATA_TYPE)5.25f * tmp[i].s[2] + (DATA_TYPE)5.25f * tmp[i].s[4] - tmp[i].s[6];
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100396 out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
397 out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
398 out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
399 out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
400 out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
401 out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100402 out[i * 8 + 7].s[0] = -tmp[i].s[1] + (DATA_TYPE)5.25f * tmp[i].s[3] - (DATA_TYPE)5.25f * tmp[i].s[5] + tmp[i].s[7];
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100403 })
404
405 TILE(uint, 64, 1, dst_indirect_y);
406
407 LOOP_UNROLLING(int, i, 0, 1, 64,
408 {
409 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
410 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
411 })
412
413 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
414
415#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
416}
ramelg012a86a302022-02-04 20:49:14 +0000417#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100418
ramelg012a86a302022-02-04 20:49:14 +0000419#if defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100420//! @cond Doxygen_Suppress
421/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
422 *
423 * @note Data layout supported: NHWC
424 * @note Data type supported: F32/F16
425 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100426 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
427 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
428 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
429 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
430 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
431 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
432 *
433 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
434 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
435 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
436 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
437 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
438 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
439 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
440 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
441 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
442 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
443 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
444 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
445 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
446 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
447 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
448 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
449 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
450 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
451 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
452 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +0000453 * @param[in] _ISRC_WIDTH The src tensor's width
454 * @param[in] _ISRC_HEIGHT The src tensor's height
455 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
456 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100457 */
458//! @endcond
459__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
460 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +0000461 TENSOR4D(dst, BUFFER),
462 const int _ISRC_WIDTH,
463 const int _ISRC_HEIGHT,
464 const int _INUM_TILES_X,
465 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100466{
467 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
468 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100469#if defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100470 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100471#else // defined(IS_BATCHED)
472 const int bout = 0; // BATCH SIZE IDX
473#endif // defined(IS_BATCHED)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100474
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100475 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
476 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
477 x -= PAD_LEFT;
478 y -= PAD_TOP;
479
480#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
481
482 TILE(DATA_TYPE, 8, 1, in);
483 TILE(DATA_TYPE, 8, 1, out);
484
485 // Initialize the input tile
486 LOOP_UNROLLING(int, i, 0, 1, 8,
487 {
488 in[i].v = 0;
489 })
490
491#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
492 T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
493#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
494 T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
495#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
496
497 LOOP_UNROLLING(int, i, 0, 1, 8,
498 {
499 in[i].v *= (DATA_TYPE) - 36.0f;
500 })
501
502 TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
503
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100504 com[0].s[0] = (DATA_TYPE)36.0f * in[2].v - (DATA_TYPE)13.0f * in[4].v + in[6].v;
505 com[0].s[1] = (DATA_TYPE)36.0f * in[1].v - (DATA_TYPE)13.0f * in[3].v + (DATA_TYPE)1.0f * in[5].v;
506 com[0].s[2] = (DATA_TYPE)9.0f * in[2].v - (DATA_TYPE)10.0f * in[4].v + in[6].v;
507 com[0].s[3] = (DATA_TYPE)18.0f * in[1].v - (DATA_TYPE)20.0f * in[3].v + (DATA_TYPE)2.0f * in[5].v;
508 com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;
509 com[0].s[5] = (DATA_TYPE)12.0f * in[1].v - (DATA_TYPE)15.0f * in[3].v + (DATA_TYPE)3.0f * in[5].v;
510 out[0].s[0] = (DATA_TYPE) - 36.0f * in[0].v + (DATA_TYPE)49.0f * in[2].v + -(DATA_TYPE)14.0f * in[4].v + in[6].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100511 out[1].s[0] = com[0].s[0] - com[0].s[1];
512 out[2].s[0] = com[0].s[0] + com[0].s[1];
513 out[3].s[0] = com[0].s[2] - com[0].s[3];
514 out[4].s[0] = com[0].s[2] + com[0].s[3];
515 out[5].s[0] = com[0].s[4] - com[0].s[5];
516 out[6].s[0] = com[0].s[4] + com[0].s[5];
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100517 out[7].s[0] = -(DATA_TYPE)36.0f * in[1].v + (DATA_TYPE)0.0f * in[2].v + (DATA_TYPE)49.0f * in[3].v - (DATA_TYPE)14.0f * in[5].v + in[7].v;
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100518
519 TILE(uint, 8, 1, dst_indirect_y);
520
521 LOOP_UNROLLING(int, i, 0, 1, 8,
522 {
523 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
524 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
525 })
526
527 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
528
529#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
530
531 TILE(DATA_TYPE, 64, 1, in);
532 TILE(DATA_TYPE, 64, 1, out);
533
534 // Initialize the input tile
535 LOOP_UNROLLING(int, i, 0, 1, 64,
536 {
537 in[i].v = 0;
538 })
539
540 // Load the tile from a NHWC tensor
541 T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
542
543 TILE(DATA_TYPE, 8, 8, com);
544
545 LOOP_UNROLLING(int, i, 0, 1, 8,
546 {
547 com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
548 com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
549 com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
550 com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
551 com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
552 com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
553 com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
554 com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
555 })
556
557 TILE(DATA_TYPE, 8, 8, tmp);
558 tmp[0].v = com[6].v;
559 tmp[1].v = com[0].v - com[1].v;
560 tmp[2].v = com[0].v + com[1].v;
561 tmp[3].v = com[2].v - com[3].v;
562 tmp[4].v = com[2].v + com[3].v;
563 tmp[5].v = com[4].v - com[5].v;
564 tmp[6].v = com[4].v + com[5].v;
565 tmp[7].v = com[7].v;
566
567 LOOP_UNROLLING(int, i, 0, 1, 8,
568 {
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100569 com[0].s[0] = (DATA_TYPE)36.0f * tmp[i].s[2] - (DATA_TYPE)13.0f * tmp[i].s[4] + tmp[i].s[6];
570 com[0].s[1] = (DATA_TYPE)36.0f * tmp[i].s[1] - (DATA_TYPE)13.0f * tmp[i].s[3] + (DATA_TYPE)1.0f * tmp[i].s[5];
571 com[0].s[2] = (DATA_TYPE)9.0f * tmp[i].s[2] - (DATA_TYPE)10.0f * tmp[i].s[4] + tmp[i].s[6];
572 com[0].s[3] = (DATA_TYPE)18.0f * tmp[i].s[1] - (DATA_TYPE)20.0f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];
573 com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];
574 com[0].s[5] = (DATA_TYPE)12.0f * tmp[i].s[1] - (DATA_TYPE)15.0f * tmp[i].s[3] + (DATA_TYPE)3.0f * tmp[i].s[5];
575 out[i * 8 + 0].s[0] = (DATA_TYPE) - 36.0f * tmp[i].s[0] + (DATA_TYPE)49.0f * tmp[i].s[2] + -(DATA_TYPE)14.0f * tmp[i].s[4] + tmp[i].s[6];
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100576 out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
577 out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
578 out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
579 out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
580 out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
581 out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
Gian Marco Iodice905a3c12023-04-14 12:20:58 +0100582 out[i * 8 + 7].s[0] = -(DATA_TYPE)36.0f * tmp[i].s[1] + (DATA_TYPE)0.0f * tmp[i].s[2] + (DATA_TYPE)49.0f * tmp[i].s[3] - (DATA_TYPE)14.0f * tmp[i].s[5] + tmp[i].s[7];
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100583 })
584
585 TILE(uint, 64, 1, dst_indirect_y);
586
587 LOOP_UNROLLING(int, i, 0, 1, 64,
588 {
589 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
590 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
591 })
592
593 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
594
595#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
596}
ramelg012a86a302022-02-04 20:49:14 +0000597#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100598
ramelg012a86a302022-02-04 20:49:14 +0000599#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100600//! @cond Doxygen_Suppress
601/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
602 *
603 * @note Data layout supported: NHWC
604 * @note Data type supported: F32/F16
605 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100606 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
607 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
608 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
609 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
610 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
611 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
612 *
613 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
614 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
615 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
616 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
617 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
618 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
619 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
620 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
621 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
622 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
623 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
624 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
625 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
626 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
627 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
628 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
629 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
630 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
631 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
632 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +0000633 * @param[in] _ISRC_WIDTH The src tensor's width
634 * @param[in] _ISRC_HEIGHT The src tensor's height
635 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
636 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100637 */
638//! @endcond
639__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
640 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +0000641 TENSOR4D(dst, BUFFER),
642 const int _ISRC_WIDTH,
643 const int _ISRC_HEIGHT,
644 const int _INUM_TILES_X,
645 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100646{
647 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
648 src_stride_x,
649 src_step_x,
650 src_stride_y,
651 src_step_y,
652 src_stride_z,
653 src_step_z,
654 src_stride_w,
655 src_step_w,
656 src_offset_first_element_in_bytes,
657 dst_ptr,
658 dst_stride_x,
659 dst_step_x,
660 dst_stride_y,
661 dst_step_y,
662 dst_stride_z,
663 dst_step_z,
664 dst_stride_w,
665 dst_step_w,
ramelg012a86a302022-02-04 20:49:14 +0000666 dst_offset_first_element_in_bytes,
667 _ISRC_WIDTH,
668 _ISRC_HEIGHT,
669 _INUM_TILES_X,
670 _INUM_TILES_Y);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100671}
ramelg012a86a302022-02-04 20:49:14 +0000672#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100673
ramelg012a86a302022-02-04 20:49:14 +0000674#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100675//! @cond Doxygen_Suppress
676/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
677 *
678 * @note Data layout supported: NHWC
679 * @note Data type supported: F32/F16
680 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100681 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
682 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
683 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
684 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
685 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
686 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
687 *
688 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
689 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
690 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
691 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
692 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
693 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
694 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
695 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
696 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
697 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
698 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
699 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
700 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
701 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
702 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
703 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
704 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
705 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
706 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
707 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +0000708 * @param[in] _ISRC_WIDTH The src tensor's width
709 * @param[in] _ISRC_HEIGHT The src tensor's height
710 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
711 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100712 */
713//! @endcond
714__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
715 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +0000716 TENSOR4D(dst, BUFFER),
717 const int _ISRC_WIDTH,
718 const int _ISRC_HEIGHT,
719 const int _INUM_TILES_X,
720 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100721{
722 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
723 src_stride_x,
724 src_step_x,
725 src_stride_y,
726 src_step_y,
727 src_stride_z,
728 src_step_z,
729 src_stride_w,
730 src_step_w,
731 src_offset_first_element_in_bytes,
732 dst_ptr,
733 dst_stride_x,
734 dst_step_x,
735 dst_stride_y,
736 dst_step_y,
737 dst_stride_z,
738 dst_step_z,
739 dst_stride_w,
740 dst_step_w,
ramelg012a86a302022-02-04 20:49:14 +0000741 dst_offset_first_element_in_bytes,
742 _ISRC_WIDTH,
743 _ISRC_HEIGHT,
744 _INUM_TILES_X,
745 _INUM_TILES_Y);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100746}
ramelg012a86a302022-02-04 20:49:14 +0000747#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100748
ramelg012a86a302022-02-04 20:49:14 +0000749#if defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100750//! @cond Doxygen_Suppress
751/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
752 *
753 * @note Data layout supported: NHWC
754 * @note Data type supported: F32/F16
755 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100756 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
757 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
758 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
759 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
760 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
761 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
762 *
763 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
764 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
765 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
766 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
767 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
768 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
769 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
770 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
771 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
772 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
773 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
774 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
775 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
776 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
777 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
778 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
779 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
780 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
781 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
782 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +0000783 * @param[in] _ISRC_WIDTH The src tensor's width
784 * @param[in] _ISRC_HEIGHT The src tensor's height
785 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
786 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100787 */
788//! @endcond
789__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
790 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +0000791 TENSOR4D(dst, BUFFER),
792 const int _ISRC_WIDTH,
793 const int _ISRC_HEIGHT,
794 const int _INUM_TILES_X,
795 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100796{
797 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
798 src_stride_x,
799 src_step_x,
800 src_stride_y,
801 src_step_y,
802 src_stride_z,
803 src_step_z,
804 src_stride_w,
805 src_step_w,
806 src_offset_first_element_in_bytes,
807 dst_ptr,
808 dst_stride_x,
809 dst_step_x,
810 dst_stride_y,
811 dst_step_y,
812 dst_stride_z,
813 dst_step_z,
814 dst_stride_w,
815 dst_step_w,
ramelg012a86a302022-02-04 20:49:14 +0000816 dst_offset_first_element_in_bytes,
817 _ISRC_WIDTH,
818 _ISRC_HEIGHT,
819 _INUM_TILES_X,
820 _INUM_TILES_Y);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100821}
ramelg012a86a302022-02-04 20:49:14 +0000822#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100823
ramelg012a86a302022-02-04 20:49:14 +0000824#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100825//! @cond Doxygen_Suppress
826/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
827 *
828 * @note Data layout supported: NHWC
829 * @note Data type supported: F32/F16
830 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
ramelg012a86a302022-02-04 20:49:14 +0000831 *
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100832 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
833 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
834 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
835 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
836 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
837 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
838 *
839 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
840 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
841 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
842 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
843 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
844 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
845 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
846 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
847 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
848 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
849 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
850 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
851 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
852 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
853 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
854 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
855 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
856 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
857 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
858 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +0000859 * @param[in] _ISRC_WIDTH The src tensor's width
860 * @param[in] _ISRC_HEIGHT The src tensor's height
861 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
862 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100863 */
864//! @endcond
865__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
866 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +0000867 TENSOR4D(dst, BUFFER),
868 const int _ISRC_WIDTH,
869 const int _ISRC_HEIGHT,
870 const int _INUM_TILES_X,
871 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100872{
873 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
874 src_stride_x,
875 src_step_x,
876 src_stride_y,
877 src_step_y,
878 src_stride_z,
879 src_step_z,
880 src_stride_w,
881 src_step_w,
882 src_offset_first_element_in_bytes,
883 dst_ptr,
884 dst_stride_x,
885 dst_step_x,
886 dst_stride_y,
887 dst_step_y,
888 dst_stride_z,
889 dst_step_z,
890 dst_stride_w,
891 dst_step_w,
ramelg012a86a302022-02-04 20:49:14 +0000892 dst_offset_first_element_in_bytes,
893 _ISRC_WIDTH,
894 _ISRC_HEIGHT,
895 _INUM_TILES_X,
896 _INUM_TILES_Y);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100897}
ramelg012a86a302022-02-04 20:49:14 +0000898#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100899
ramelg012a86a302022-02-04 20:49:14 +0000900#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100901//! @cond Doxygen_Suppress
902/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
903 *
904 * @note Data layout supported: NHWC
905 * @note Data type supported: F32/F16
906 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100907 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
908 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
909 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
910 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
911 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
912 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
913 *
914 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
915 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
916 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
917 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
918 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
919 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
920 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
921 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
922 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
923 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
924 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
925 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
926 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
927 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
928 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
929 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
930 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
931 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
932 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
933 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +0000934 * @param[in] _ISRC_WIDTH The src tensor's width
935 * @param[in] _ISRC_HEIGHT The src tensor's height
936 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
937 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100938 */
939//! @endcond
940__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
941 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +0000942 TENSOR4D(dst, BUFFER),
943 const int _ISRC_WIDTH,
944 const int _ISRC_HEIGHT,
945 const int _INUM_TILES_X,
946 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100947{
948 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
949 src_stride_x,
950 src_step_x,
951 src_stride_y,
952 src_step_y,
953 src_stride_z,
954 src_step_z,
955 src_stride_w,
956 src_step_w,
957 src_offset_first_element_in_bytes,
958 dst_ptr,
959 dst_stride_x,
960 dst_step_x,
961 dst_stride_y,
962 dst_step_y,
963 dst_stride_z,
964 dst_step_z,
965 dst_stride_w,
966 dst_step_w,
ramelg012a86a302022-02-04 20:49:14 +0000967 dst_offset_first_element_in_bytes,
968 _ISRC_WIDTH,
969 _ISRC_HEIGHT,
970 _INUM_TILES_X,
971 _INUM_TILES_Y);
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100972}
ramelg012a86a302022-02-04 20:49:14 +0000973#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100974
ramelg012a86a302022-02-04 20:49:14 +0000975#if defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100976//! @cond Doxygen_Suppress
977/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
978 *
979 * @note Data layout supported: NHWC
980 * @note Data type supported: F32/F16
981 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
Adnan AlSinan7075fe22021-07-05 13:12:52 +0100982 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
983 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
984 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
985 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
986 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
987 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
988 *
989 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
990 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
991 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
992 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
993 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
994 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
995 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
996 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
997 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
998 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
999 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1000 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1001 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1002 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1003 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1004 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1005 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1006 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1007 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1008 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
ramelg012a86a302022-02-04 20:49:14 +00001009 * @param[in] _ISRC_WIDTH The src tensor's width
1010 * @param[in] _ISRC_HEIGHT The src tensor's height
1011 * @param[in] _INUM_TILES_X The number of tiles in the X dimension
1012 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001013 */
1014//! @endcond
1015__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
1016 TENSOR4D(src, BUFFER),
ramelg012a86a302022-02-04 20:49:14 +00001017 TENSOR4D(dst, BUFFER),
1018 const int _ISRC_WIDTH,
1019 const int _ISRC_HEIGHT,
1020 const int _INUM_TILES_X,
1021 const int _INUM_TILES_Y)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001022{
1023 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
1024 src_stride_x,
1025 src_step_x,
1026 src_stride_y,
1027 src_step_y,
1028 src_stride_z,
1029 src_step_z,
1030 src_stride_w,
1031 src_step_w,
1032 src_offset_first_element_in_bytes,
1033 dst_ptr,
1034 dst_stride_x,
1035 dst_step_x,
1036 dst_stride_y,
1037 dst_step_y,
1038 dst_stride_z,
1039 dst_step_z,
1040 dst_stride_w,
1041 dst_step_w,
ramelg012a86a302022-02-04 20:49:14 +00001042 dst_offset_first_element_in_bytes,
1043 _ISRC_WIDTH,
1044 _ISRC_HEIGHT,
1045 _INUM_TILES_X,
1046 _INUM_TILES_Y);
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001047}
ramelg012a86a302022-02-04 20:49:14 +00001048#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
1049#endif // defined(NHWC)
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001050#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)