blob: 4865982a552737d665f0774d32f8652eda605ff9 [file] [log] [blame]
Adnan AlSinan7075fe22021-07-05 13:12:52 +01001/*
2 * Copyright (c) 2018-2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25#include "tile_helpers.h"
26
27#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
28 ({ \
29 comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
30 comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
31 comm_fact.s2 = 2.5f * tmp.s3; \
32 comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
33 comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
34 comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
35 comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
36 \
37 out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
38 out.s1 = comm_fact.s0 + comm_fact.s1; \
39 out.s2 = comm_fact.s0 - comm_fact.s1; \
40 out.s3 = comm_fact.s3 + comm_fact.s4; \
41 out.s4 = comm_fact.s4 - comm_fact.s3; \
42 out.s5 = comm_fact.s5 + comm_fact.s6; \
43 out.s6 = comm_fact.s5 - comm_fact.s6; \
44 out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
45 })
46
47#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
48 ({ \
49 comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
50 comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
51 comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
52 comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
53 comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
54 comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
55 out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
56 out.s1 = comm_fact.s0 - comm_fact.s1; \
57 out.s2 = comm_fact.s0 + comm_fact.s1; \
58 out.s3 = comm_fact.s2 - comm_fact.s3; \
59 out.s4 = comm_fact.s2 + comm_fact.s3; \
60 out.s5 = comm_fact.s4 - comm_fact.s5; \
61 out.s6 = comm_fact.s4 + comm_fact.s5; \
62 out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
63 })
64
65#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
66
67#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
68//! @cond Doxygen_Suppress
69/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
70 *
71 * @note Data layout supported: NHWC
72 * @note Data type supported: F32/F16
73 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
74 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
75 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
76 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
77 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
78 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
79 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
80 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
81 *
82 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
83 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
84 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
85 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
86 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
87 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
88 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
89 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
90 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
91 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
92 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
93 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
94 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
95 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
96 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
97 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
98 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
99 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
100 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
101 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
102 */
103//! @endcond
104__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
105 TENSOR4D(src, BUFFER),
106 TENSOR4D(dst, BUFFER))
107{
108 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
109 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
110 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
111
112 // All the tensor dimensions are passed at compile time.
113 // In case of dynamic tensor support, the following dimensions should be passed as function argument.
114#define _ISRC_WIDTH SRC_WIDTH
115#define _ISRC_HEIGHT SRC_HEIGHT
116#define _INUM_TILES_X NUM_TILES_X
117#define _INUM_TILES_Y NUM_TILES_Y
118
119 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
120 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
121 x -= PAD_LEFT;
122 y -= PAD_TOP;
123
124#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
125
126 TILE(DATA_TYPE, 6, 1, in);
127 TILE(DATA_TYPE, 6, 1, out);
128
129 // Initialize the input tile
130 LOOP_UNROLLING(int, i, 0, 1, 6,
131 {
132 in[i].v = 0;
133 })
134
135#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
136 T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
137#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
138 T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
139#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
140
141 TILE(DATA_TYPE, 6, 1, com);
142
143 LOOP_UNROLLING(int, i, 0, 1, 6,
144 {
145 in[i].v *= 4.0f;
146 })
147
148 com[0].v = in[2].v - 4.f * in[0].v;
149 com[1].v = in[3].v - 4.f * in[1].v;
150 com[2].v = in[4].v - 4.f * in[2].v;
151 com[3].v = in[5].v - 4.f * in[3].v;
152 com[4].v = in[3].v - in[1].v;
153 com[4].v = com[4].v + com[4].v;
154 com[5].v = in[4].v - in[2].v;
155
156 out[0].v = com[2].v - com[0].v;
157 out[1].v = com[2].v + com[1].v;
158 out[2].v = com[2].v - com[1].v;
159 out[3].v = com[5].v + com[4].v;
160 out[4].v = com[5].v - com[4].v;
161 out[5].v = com[3].v - com[1].v;
162
163 TILE(uint, 6, 1, dst_indirect_y);
164
165 LOOP_UNROLLING(int, i, 0, 1, 6,
166 {
167 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
168 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6;
169 })
170
171 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
172
173#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
174
175 TILE(DATA_TYPE, 36, 1, in);
176
177 // Initialize the input tile
178 LOOP_UNROLLING(int, i, 0, 1, 36,
179 {
180 in[i].v = 0;
181 })
182
183 // Load the tile from a NHWC tensor
184 T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
185
186 TILE(DATA_TYPE, 6, 1, com);
187 TILE(DATA_TYPE, 36, 1, tmp);
188
189 LOOP_UNROLLING(int, i, 0, 1, 6,
190 {
191 com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
192 com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
193 com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
194 com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
195 com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
196 com[4].v = com[4].v + com[4].v;
197 com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
198 tmp[i + 0 * 6].v = com[2].v - com[0].v;
199 tmp[i + 1 * 6].v = com[2].v + com[1].v;
200 tmp[i + 2 * 6].v = com[2].v - com[1].v;
201 tmp[i + 3 * 6].v = com[5].v + com[4].v;
202 tmp[i + 4 * 6].v = com[5].v - com[4].v;
203 tmp[i + 5 * 6].v = com[3].v - com[1].v;
204 })
205
206 TILE(DATA_TYPE, 36, 1, out);
207
208 LOOP_UNROLLING(int, i, 0, 1, 6,
209 {
210 com[0].v = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v;
211 com[1].v = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v;
212 com[2].v = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v;
213 com[3].v = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v;
214 com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
215 com[4].v = com[4].v + com[4].v;
216 com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
217 out[i * 6 + 0].v = com[2].v - com[0].v;
218 out[i * 6 + 1].v = com[2].v + com[1].v;
219 out[i * 6 + 2].v = com[2].v - com[1].v;
220 out[i * 6 + 3].v = com[5].v + com[4].v;
221 out[i * 6 + 4].v = com[5].v - com[4].v;
222 out[i * 6 + 5].v = com[3].v - com[1].v;
223 })
224
225 // Compute destination address
226 TILE(uint, 36, 1, dst_indirect_y);
227
228 LOOP_UNROLLING(int, i, 0, 1, 36,
229 {
230 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
231 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36;
232 })
233
234 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
235#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
236}
237
238//! @cond Doxygen_Suppress
239/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
240 *
241 * @note Data layout supported: NHWC
242 * @note Data type supported: F32/F16
243 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
244 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
245 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
246 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
247 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
248 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
249 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
250 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
251 *
252 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
253 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
254 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
255 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
256 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
257 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
258 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
259 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
260 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
261 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
262 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
263 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
264 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
265 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
266 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
267 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
268 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
269 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
270 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
271 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
272 */
273//! @endcond
274__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
275 TENSOR4D(src, BUFFER),
276 TENSOR4D(dst, BUFFER))
277{
278 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
279 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
280 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
281
282 // All the tensor dimensions are passed at compile time.
283 // In case of dynamic tensor support, the following dimensions should be passed as function argument.
284#define _ISRC_WIDTH SRC_WIDTH
285#define _ISRC_HEIGHT SRC_HEIGHT
286#define _INUM_TILES_X NUM_TILES_X
287#define _INUM_TILES_Y NUM_TILES_Y
288
289 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
290 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
291 x -= PAD_LEFT;
292 y -= PAD_TOP;
293
294#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
295
296 TILE(DATA_TYPE, 8, 1, in);
297 TILE(DATA_TYPE, 8, 1, out);
298
299 // Initialize the input tile
300 LOOP_UNROLLING(int, i, 0, 1, 8,
301 {
302 in[i].v = 0;
303 })
304
305#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
306 T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
307#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
308 T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
309#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
310
311 TILE(DATA_TYPE, 1, 8, com);
312
313 com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
314 com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
315 com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
316 com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
317 com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
318 com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
319 out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
320 out[1].s[0] = com[0].s[0] + com[0].s[1];
321 out[2].s[0] = com[0].s[0] - com[0].s[1];
322 out[3].s[0] = com[0].s[3] + com[0].s[2];
323 out[4].s[0] = com[0].s[3] - com[0].s[2];
324 out[5].s[0] = com[0].s[4] + com[0].s[5];
325 out[6].s[0] = com[0].s[4] - com[0].s[5];
326 out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
327
328 TILE(uint, 8, 1, dst_indirect_y);
329
330 LOOP_UNROLLING(int, i, 0, 1, 8,
331 {
332 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
333 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
334 })
335
336 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
337
338#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
339
340 TILE(DATA_TYPE, 64, 1, in);
341 TILE(DATA_TYPE, 64, 1, out);
342
343 // Initialize the input tile
344 LOOP_UNROLLING(int, i, 0, 1, 64,
345 {
346 in[i].v = 0;
347 })
348
349 // Load the tile from a NHWC tensor
350 T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
351
352 TILE(DATA_TYPE, 8, 8, com);
353
354 LOOP_UNROLLING(int, i, 0, 1, 8,
355 {
356 com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
357 com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
358 com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
359 com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
360 com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
361 com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
362 com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
363 com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
364 })
365
366 TILE(DATA_TYPE, 8, 8, tmp);
367 tmp[0].v = com[6].v;
368 tmp[1].v = com[0].v + com[1].v;
369 tmp[2].v = com[0].v - com[1].v;
370 tmp[3].v = com[2].v + com[3].v;
371 tmp[4].v = com[2].v - com[3].v;
372 tmp[5].v = com[4].v + com[5].v;
373 tmp[6].v = com[4].v - com[5].v;
374 tmp[7].v = com[7].v;
375
376 LOOP_UNROLLING(int, i, 0, 1, 8,
377 {
378 com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
379 com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
380 com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
381 com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
382 com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
383 com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
384 out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
385 out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
386 out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
387 out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
388 out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
389 out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
390 out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
391 out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
392 })
393
394 TILE(uint, 64, 1, dst_indirect_y);
395
396 LOOP_UNROLLING(int, i, 0, 1, 64,
397 {
398 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
399 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
400 })
401
402 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
403
404#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
405}
406
407//! @cond Doxygen_Suppress
408/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
409 *
410 * @note Data layout supported: NHWC
411 * @note Data type supported: F32/F16
412 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
413 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
414 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
415 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
416 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
417 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
418 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
419 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
420 *
421 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
422 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
423 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
424 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
425 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
426 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
427 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
428 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
429 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
430 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
431 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
432 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
433 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
434 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
435 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
436 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
437 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
438 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
439 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
440 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
441 */
442//! @endcond
443__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
444 TENSOR4D(src, BUFFER),
445 TENSOR4D(dst, BUFFER))
446{
447 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
448 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
449 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
450
451 // All the tensor dimensions are passed at compile time.
452 // In case of dynamic tensor support, the following dimensions should be passed as function argument.
453#define _ISRC_WIDTH SRC_WIDTH
454#define _ISRC_HEIGHT SRC_HEIGHT
455#define _INUM_TILES_X NUM_TILES_X
456#define _INUM_TILES_Y NUM_TILES_Y
457
458 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
459 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
460 x -= PAD_LEFT;
461 y -= PAD_TOP;
462
463#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
464
465 TILE(DATA_TYPE, 8, 1, in);
466 TILE(DATA_TYPE, 8, 1, out);
467
468 // Initialize the input tile
469 LOOP_UNROLLING(int, i, 0, 1, 8,
470 {
471 in[i].v = 0;
472 })
473
474#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
475 T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
476#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
477 T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
478#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
479
480 LOOP_UNROLLING(int, i, 0, 1, 8,
481 {
482 in[i].v *= (DATA_TYPE) - 36.0f;
483 })
484
485 TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
486
487 com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
488 com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
489 com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
490 com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
491 com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
492 com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
493 out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
494 out[1].s[0] = com[0].s[0] - com[0].s[1];
495 out[2].s[0] = com[0].s[0] + com[0].s[1];
496 out[3].s[0] = com[0].s[2] - com[0].s[3];
497 out[4].s[0] = com[0].s[2] + com[0].s[3];
498 out[5].s[0] = com[0].s[4] - com[0].s[5];
499 out[6].s[0] = com[0].s[4] + com[0].s[5];
500 out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
501
502 TILE(uint, 8, 1, dst_indirect_y);
503
504 LOOP_UNROLLING(int, i, 0, 1, 8,
505 {
506 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
507 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
508 })
509
510 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
511
512#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
513
514 TILE(DATA_TYPE, 64, 1, in);
515 TILE(DATA_TYPE, 64, 1, out);
516
517 // Initialize the input tile
518 LOOP_UNROLLING(int, i, 0, 1, 64,
519 {
520 in[i].v = 0;
521 })
522
523 // Load the tile from a NHWC tensor
524 T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
525
526 TILE(DATA_TYPE, 8, 8, com);
527
528 LOOP_UNROLLING(int, i, 0, 1, 8,
529 {
530 com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
531 com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
532 com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
533 com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
534 com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
535 com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
536 com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
537 com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
538 })
539
540 TILE(DATA_TYPE, 8, 8, tmp);
541 tmp[0].v = com[6].v;
542 tmp[1].v = com[0].v - com[1].v;
543 tmp[2].v = com[0].v + com[1].v;
544 tmp[3].v = com[2].v - com[3].v;
545 tmp[4].v = com[2].v + com[3].v;
546 tmp[5].v = com[4].v - com[5].v;
547 tmp[6].v = com[4].v + com[5].v;
548 tmp[7].v = com[7].v;
549
550 LOOP_UNROLLING(int, i, 0, 1, 8,
551 {
552 com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
553 com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
554 com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
555 com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
556 com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
557 com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
558 out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
559 out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
560 out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
561 out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
562 out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
563 out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
564 out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
565 out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
566 })
567
568 TILE(uint, 64, 1, dst_indirect_y);
569
570 LOOP_UNROLLING(int, i, 0, 1, 64,
571 {
572 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
573 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
574 })
575
576 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
577
578#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
579}
580
581//! @cond Doxygen_Suppress
582/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
583 *
584 * @note Data layout supported: NHWC
585 * @note Data type supported: F32/F16
586 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
587 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
588 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
589 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
590 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
591 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
592 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
593 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
594 *
595 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
596 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
597 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
598 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
599 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
600 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
601 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
602 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
603 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
604 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
605 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
606 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
607 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
608 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
609 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
610 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
611 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
612 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
613 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
614 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
615 */
616//! @endcond
617__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
618 TENSOR4D(src, BUFFER),
619 TENSOR4D(dst, BUFFER))
620{
621 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
622 src_stride_x,
623 src_step_x,
624 src_stride_y,
625 src_step_y,
626 src_stride_z,
627 src_step_z,
628 src_stride_w,
629 src_step_w,
630 src_offset_first_element_in_bytes,
631 dst_ptr,
632 dst_stride_x,
633 dst_step_x,
634 dst_stride_y,
635 dst_step_y,
636 dst_stride_z,
637 dst_step_z,
638 dst_stride_w,
639 dst_step_w,
640 dst_offset_first_element_in_bytes);
641}
642
643//! @cond Doxygen_Suppress
644/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
645 *
646 * @note Data layout supported: NHWC
647 * @note Data type supported: F32/F16
648 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
649 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
650 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
651 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
652 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
653 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
654 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
655 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
656 *
657 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
658 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
659 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
660 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
661 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
662 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
663 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
664 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
665 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
666 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
667 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
668 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
669 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
670 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
671 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
672 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
673 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
674 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
675 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
676 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
677 */
678//! @endcond
679__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
680 TENSOR4D(src, BUFFER),
681 TENSOR4D(dst, BUFFER))
682{
683 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
684 src_stride_x,
685 src_step_x,
686 src_stride_y,
687 src_step_y,
688 src_stride_z,
689 src_step_z,
690 src_stride_w,
691 src_step_w,
692 src_offset_first_element_in_bytes,
693 dst_ptr,
694 dst_stride_x,
695 dst_step_x,
696 dst_stride_y,
697 dst_step_y,
698 dst_stride_z,
699 dst_step_z,
700 dst_stride_w,
701 dst_step_w,
702 dst_offset_first_element_in_bytes);
703}
704
705//! @cond Doxygen_Suppress
706/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
707 *
708 * @note Data layout supported: NHWC
709 * @note Data type supported: F32/F16
710 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
711 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
712 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
713 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
714 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
715 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
716 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
717 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
718 *
719 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
720 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
721 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
722 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
723 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
724 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
725 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
726 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
727 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
728 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
729 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
730 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
731 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
732 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
733 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
734 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
735 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
736 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
737 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
738 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
739 */
740//! @endcond
741__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
742 TENSOR4D(src, BUFFER),
743 TENSOR4D(dst, BUFFER))
744{
745 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
746 src_stride_x,
747 src_step_x,
748 src_stride_y,
749 src_step_y,
750 src_stride_z,
751 src_step_z,
752 src_stride_w,
753 src_step_w,
754 src_offset_first_element_in_bytes,
755 dst_ptr,
756 dst_stride_x,
757 dst_step_x,
758 dst_stride_y,
759 dst_step_y,
760 dst_stride_z,
761 dst_step_z,
762 dst_stride_w,
763 dst_step_w,
764 dst_offset_first_element_in_bytes);
765}
766
767//! @cond Doxygen_Suppress
768/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
769 *
770 * @note Data layout supported: NHWC
771 * @note Data type supported: F32/F16
772 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
773 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
774 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
775 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
776 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
777 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
778 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
779 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
780 *
781 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
782 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
783 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
784 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
785 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
786 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
787 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
788 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
789 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
790 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
791 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
792 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
793 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
794 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
795 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
796 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
797 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
798 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
799 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
800 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
801 */
802//! @endcond
803__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
804 TENSOR4D(src, BUFFER),
805 TENSOR4D(dst, BUFFER))
806{
807 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
808 src_stride_x,
809 src_step_x,
810 src_stride_y,
811 src_step_y,
812 src_stride_z,
813 src_step_z,
814 src_stride_w,
815 src_step_w,
816 src_offset_first_element_in_bytes,
817 dst_ptr,
818 dst_stride_x,
819 dst_step_x,
820 dst_stride_y,
821 dst_step_y,
822 dst_stride_z,
823 dst_step_z,
824 dst_stride_w,
825 dst_step_w,
826 dst_offset_first_element_in_bytes);
827}
828
829//! @cond Doxygen_Suppress
830/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
831 *
832 * @note Data layout supported: NHWC
833 * @note Data type supported: F32/F16
834 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
835 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
836 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
837 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
838 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
839 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
840 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
841 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
842 *
843 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
844 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
845 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
846 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
847 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
848 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
849 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
850 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
851 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
852 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
853 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
854 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
855 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
856 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
857 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
858 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
859 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
860 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
861 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
862 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
863 */
864//! @endcond
865__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
866 TENSOR4D(src, BUFFER),
867 TENSOR4D(dst, BUFFER))
868{
869 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
870 src_stride_x,
871 src_step_x,
872 src_stride_y,
873 src_step_y,
874 src_stride_z,
875 src_step_z,
876 src_stride_w,
877 src_step_w,
878 src_offset_first_element_in_bytes,
879 dst_ptr,
880 dst_stride_x,
881 dst_step_x,
882 dst_stride_y,
883 dst_step_y,
884 dst_stride_z,
885 dst_step_z,
886 dst_stride_w,
887 dst_step_w,
888 dst_offset_first_element_in_bytes);
889}
890
891//! @cond Doxygen_Suppress
892/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
893 *
894 * @note Data layout supported: NHWC
895 * @note Data type supported: F32/F16
896 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
897 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
898 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
899 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
900 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
901 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
902 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
903 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
904 *
905 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
906 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
907 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
908 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
909 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
910 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
911 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
912 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
913 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
914 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
915 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
916 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
917 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
918 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
919 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
920 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
921 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
922 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
923 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
924 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
925 */
926//! @endcond
927__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
928 TENSOR4D(src, BUFFER),
929 TENSOR4D(dst, BUFFER))
930{
931 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
932 src_stride_x,
933 src_step_x,
934 src_stride_y,
935 src_step_y,
936 src_stride_z,
937 src_step_z,
938 src_stride_w,
939 src_step_w,
940 src_offset_first_element_in_bytes,
941 dst_ptr,
942 dst_stride_x,
943 dst_step_x,
944 dst_stride_y,
945 dst_step_y,
946 dst_stride_z,
947 dst_step_z,
948 dst_stride_w,
949 dst_step_w,
950 dst_offset_first_element_in_bytes);
951}
952#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
953#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)