blob: fbb5e9519697c02ef7dfccc1db439daf93d9201b [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
Giorgio Arena049989a2021-03-22 17:02:26 +00002 * Copyright (c) 2018-2021 Arm Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
Gian Marco Iodice534b8892021-04-01 16:17:16 +010025#include "tile_helpers.h"
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010026
Gian Marco Iodice534b8892021-04-01 16:17:16 +010027#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
Aleksandr Nikolaev2ca5b082021-03-18 14:03:48 +000028 ({ \
Gian Marco Iodice534b8892021-04-01 16:17:16 +010029 comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
30 comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
31 comm_fact.s2 = 2.5f * tmp.s3; \
32 comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
33 comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
34 comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
35 comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
Aleksandr Nikolaev2ca5b082021-03-18 14:03:48 +000036 \
Gian Marco Iodice534b8892021-04-01 16:17:16 +010037 out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
Gian Marco Iodiced28b7512018-07-06 12:59:28 +010038 out.s1 = comm_fact.s0 + comm_fact.s1; \
39 out.s2 = comm_fact.s0 - comm_fact.s1; \
40 out.s3 = comm_fact.s3 + comm_fact.s4; \
41 out.s4 = comm_fact.s4 - comm_fact.s3; \
42 out.s5 = comm_fact.s5 + comm_fact.s6; \
43 out.s6 = comm_fact.s5 - comm_fact.s6; \
Gian Marco Iodice534b8892021-04-01 16:17:16 +010044 out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
Gian Marco Iodiced28b7512018-07-06 12:59:28 +010045 })
46
Michele Di Giorgiof955d512019-02-27 14:26:51 +000047#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
48 ({ \
49 comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
50 comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
51 comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
52 comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
53 comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
54 comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
55 out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
56 out.s1 = comm_fact.s0 - comm_fact.s1; \
57 out.s2 = comm_fact.s0 + comm_fact.s1; \
58 out.s3 = comm_fact.s2 - comm_fact.s3; \
59 out.s4 = comm_fact.s2 + comm_fact.s3; \
60 out.s5 = comm_fact.s4 - comm_fact.s5; \
61 out.s6 = comm_fact.s4 + comm_fact.s5; \
62 out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
63 })
64
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010065#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
66/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
67 *
68 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
69 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
70 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
71 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
72 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
73 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010074 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010075 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +010076 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010077 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
78 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
79 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
80 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
81 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
82 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
83 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
84 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
85 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
86 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
87 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
88 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
89 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
90 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
91 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +010092 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
93 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +010094 */
95__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
96 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +010097 TENSOR3D_DECLARATION(dst),
98 uint src_stride_w,
99 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100100{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100101 const int x = get_global_id(0);
102 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000103#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100104 const int z = get_global_id(2) % SRC_DEPTH;
105 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000106#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000107 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000108#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100109
110 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000111#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100112 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000113#else /* defined(SRC_DEPTH) */
114 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
115#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100116
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100117 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100118
119#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100120 VEC_DATA_TYPE(DATA_TYPE, 4)
121 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100122#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100123 VEC_DATA_TYPE(DATA_TYPE, 4)
124 in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
125 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
126 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
127 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100128#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100129 VEC_DATA_TYPE(DATA_TYPE, 4)
130 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
131 VEC_DATA_TYPE(DATA_TYPE, 4)
132 in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
133 VEC_DATA_TYPE(DATA_TYPE, 4)
134 in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
135 VEC_DATA_TYPE(DATA_TYPE, 4)
136 in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100137#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
138
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100139 VEC_DATA_TYPE(DATA_TYPE, 4)
140 tmp0 = in_row0;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100141
142#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
143 tmp0 -= in_row2;
144#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
145
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100146 DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
147 DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
148 DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
149 DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100150
151#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100152 VEC_DATA_TYPE(DATA_TYPE, 4)
153 tmp1 = in_row1 + in_row2;
154 VEC_DATA_TYPE(DATA_TYPE, 4)
155 tmp2 = in_row2 - in_row1;
156 VEC_DATA_TYPE(DATA_TYPE, 4)
157 tmp3 = in_row1 - in_row3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100158
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100159 DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
160 DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
161 DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
162 DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100163
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100164 DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
165 DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
166 DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
167 DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100168
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100169 DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
170 DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
171 DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
172 DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100173#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
174
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000175#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100176 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000177#else /* defined(SRC_DEPTH) */
178 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
179#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100180
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100181 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
182 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
183 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
184 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100185
186#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100187 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
188 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
189 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
190 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
191 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
192 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
193 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
194 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
195 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
196 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
197 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
198 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100199#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
200}
201
202/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2
203 *
204 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
205 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
206 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
207 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
208 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
209 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100210 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100211 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100212 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100213 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
214 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
215 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
216 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
217 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
218 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
219 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
220 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
221 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
222 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
223 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
224 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
225 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
226 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
227 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100228 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
229 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100230 */
231__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
232 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100233 TENSOR3D_DECLARATION(dst),
234 uint src_stride_w,
235 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100236{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100237 const int x = get_global_id(0);
238 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000239#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100240 const int z = (get_global_id(2) * 2) % SRC_DEPTH;
241 const int b = (get_global_id(2) * 2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000242#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000243 const int z = get_global_id(2) * 2;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000244#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100245
246 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000247#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100248 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000249#else /* defined(SRC_DEPTH) */
250 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
251#endif /* defined(SRC_DEPTH) */
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100252 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100253
254#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100255 VEC_DATA_TYPE(DATA_TYPE, 4)
256 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100257#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100258 VEC_DATA_TYPE(DATA_TYPE, 4)
259 in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
260 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
261 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
262 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100263#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100264 VEC_DATA_TYPE(DATA_TYPE, 4)
265 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
266 VEC_DATA_TYPE(DATA_TYPE, 4)
267 in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
268 VEC_DATA_TYPE(DATA_TYPE, 4)
269 in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
270 VEC_DATA_TYPE(DATA_TYPE, 4)
271 in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100272#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
273
274 src_addr += src_stride_z;
275#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100276 VEC_DATA_TYPE(DATA_TYPE, 4)
277 in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100278#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100279 VEC_DATA_TYPE(DATA_TYPE, 4)
280 in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
281 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
282 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
283 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100284#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100285 VEC_DATA_TYPE(DATA_TYPE, 4)
286 in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
287 VEC_DATA_TYPE(DATA_TYPE, 4)
288 in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
289 VEC_DATA_TYPE(DATA_TYPE, 4)
290 in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
291 VEC_DATA_TYPE(DATA_TYPE, 4)
292 in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100293#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
294
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100295 VEC_DATA_TYPE(DATA_TYPE, 4)
296 tmp0 = in_row0;
297 VEC_DATA_TYPE(DATA_TYPE, 4)
298 tmp4 = in_row4;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100299
300#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
301 tmp0 -= in_row2;
302 tmp4 -= in_row6;
303#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
304
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100305 VEC_DATA_TYPE(DATA_TYPE, 2)
306 out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
307 VEC_DATA_TYPE(DATA_TYPE, 2)
308 out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
309 VEC_DATA_TYPE(DATA_TYPE, 2)
310 out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
311 VEC_DATA_TYPE(DATA_TYPE, 2)
312 out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100313
314#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100315 VEC_DATA_TYPE(DATA_TYPE, 4)
316 tmp1 = in_row1 + in_row2;
317 VEC_DATA_TYPE(DATA_TYPE, 4)
318 tmp2 = in_row2 - in_row1;
319 VEC_DATA_TYPE(DATA_TYPE, 4)
320 tmp3 = in_row1 - in_row3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100321
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100322 VEC_DATA_TYPE(DATA_TYPE, 4)
323 tmp5 = in_row5 + in_row6;
324 VEC_DATA_TYPE(DATA_TYPE, 4)
325 tmp6 = in_row6 - in_row5;
326 VEC_DATA_TYPE(DATA_TYPE, 4)
327 tmp7 = in_row5 - in_row7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100328
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100329 VEC_DATA_TYPE(DATA_TYPE, 2)
330 out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
331 VEC_DATA_TYPE(DATA_TYPE, 2)
332 out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
333 VEC_DATA_TYPE(DATA_TYPE, 2)
334 out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
335 VEC_DATA_TYPE(DATA_TYPE, 2)
336 out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100337
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100338 VEC_DATA_TYPE(DATA_TYPE, 2)
339 out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
340 VEC_DATA_TYPE(DATA_TYPE, 2)
341 out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
342 VEC_DATA_TYPE(DATA_TYPE, 2)
343 out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
344 VEC_DATA_TYPE(DATA_TYPE, 2)
345 out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100346
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100347 VEC_DATA_TYPE(DATA_TYPE, 2)
348 out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
349 VEC_DATA_TYPE(DATA_TYPE, 2)
350 out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
351 VEC_DATA_TYPE(DATA_TYPE, 2)
352 out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
353 VEC_DATA_TYPE(DATA_TYPE, 2)
354 out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100355#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
356
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000357#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100358 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000359#else /* defined(SRC_DEPTH) */
360 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
361#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100362
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100363 vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
364 vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
365 vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
366 vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100367
368#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100369 vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
370 vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
371 vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
372 vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
373 vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
374 vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
375 vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
376 vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
377 vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
378 vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
379 vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
380 vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100381#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
382}
383
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100384/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100385 *
386 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
387 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
388 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
389 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
390 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
391 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100392 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100393 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100394 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100395 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
396 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
397 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
398 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
399 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
400 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
401 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
402 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
403 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
404 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
405 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
406 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
407 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
408 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
409 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100410 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
411 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100412 */
413__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
414 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100415 TENSOR3D_DECLARATION(dst),
416 uint src_stride_w,
417 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100418{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100419 const int x = get_global_id(0);
420 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000421#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100422 const int z = get_global_id(2) % SRC_DEPTH;
423 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000424#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000425 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000426#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100427
428 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000429#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100430 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000431#else /* defined(SRC_DEPTH) */
432 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
433#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100434
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100435 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100436
437#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
438 // Row0
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100439 VEC_DATA_TYPE(DATA_TYPE, 4)
440 d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
441 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
442 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
443 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
444 VEC_DATA_TYPE(DATA_TYPE, 2)
445 d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
446 *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100447#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
448 // Row0
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100449 VEC_DATA_TYPE(DATA_TYPE, 4)
450 d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
451 VEC_DATA_TYPE(DATA_TYPE, 2)
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000452 d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100453#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
454
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100455 DATA_TYPE out0 = 0.0f;
456 DATA_TYPE out1 = 0.0f;
457 DATA_TYPE out2 = 0.0f;
458 DATA_TYPE out3 = 0.0f;
459 DATA_TYPE out4 = 0.0f;
460 DATA_TYPE out5 = 0.0f;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100461
462 // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
463 out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
464 out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
465 out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
466 out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
467 out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
468 out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
469
470#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
471 // Row4
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100472 VEC_DATA_TYPE(DATA_TYPE, 4)
473 d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
474 VEC_DATA_TYPE(DATA_TYPE, 2)
475 d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100476
477 // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100478 DATA_TYPE k0 = d41.s0;
479 DATA_TYPE k1 = d41.s0;
480 DATA_TYPE k2 = d41.s0;
481 DATA_TYPE k3 = d41.s0;
482 DATA_TYPE k4 = d41.s0;
483 DATA_TYPE k5 = 0.0f;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100484
485 k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
486 k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
487 k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
488 k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
489 k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
490 k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
491
492 out0 += k0;
493 out1 += k1;
494 out2 += k2;
495 out3 += k3;
496 out4 += k4;
497 out5 += k5;
498
499 // Row2
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100500 VEC_DATA_TYPE(DATA_TYPE, 4)
501 d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
502 VEC_DATA_TYPE(DATA_TYPE, 2)
503 d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100504
505 out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
506 out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
507 out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
508 out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
509 out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
510 out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
511#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
512
513 // Compute destination address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000514#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100515 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000516#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000517 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000518#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100519
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100520 uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100521
522 *(dst_addr) = out0;
523 dst_addr += dst_plane_stride;
524 *(dst_addr) = out1;
525 dst_addr += dst_plane_stride;
526 *(dst_addr) = out2;
527 dst_addr += dst_plane_stride;
528 *(dst_addr) = out3;
529 dst_addr += dst_plane_stride;
530 *(dst_addr) = out4;
531 dst_addr += dst_plane_stride;
532 *(dst_addr) = out5;
533 dst_addr += dst_plane_stride;
534
535#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100536 DATA_TYPE out6 = k0;
537 DATA_TYPE out7 = k1;
538 DATA_TYPE out8 = k2;
539 DATA_TYPE out9 = k3;
540 DATA_TYPE out10 = k4;
541 DATA_TYPE out11 = k5;
542 DATA_TYPE out12 = k0;
543 DATA_TYPE out13 = k1;
544 DATA_TYPE out14 = k2;
545 DATA_TYPE out15 = k3;
546 DATA_TYPE out16 = k4;
547 DATA_TYPE out17 = k5;
548 DATA_TYPE out18 = k0;
549 DATA_TYPE out19 = k1;
550 DATA_TYPE out20 = k2;
551 DATA_TYPE out21 = k3;
552 DATA_TYPE out22 = k4;
553 DATA_TYPE out23 = k5;
554 DATA_TYPE out24 = k0;
555 DATA_TYPE out25 = k1;
556 DATA_TYPE out26 = k2;
557 DATA_TYPE out27 = k3;
558 DATA_TYPE out28 = k4;
559 DATA_TYPE out29 = k5;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100560
561 // Row1
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100562 VEC_DATA_TYPE(DATA_TYPE, 4)
563 d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
564 VEC_DATA_TYPE(DATA_TYPE, 2)
565 d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100566
567 // Row3
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100568 VEC_DATA_TYPE(DATA_TYPE, 4)
569 d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
570 VEC_DATA_TYPE(DATA_TYPE, 2)
571 d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100572
573 // Compute common parts for the channels between [6, 29]
574 // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
575 // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100576 DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
577 DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
578 DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
579 DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
580 DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
581 DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
582 DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
583 DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
584 DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
585 DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
586 DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
587 DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100588
589 // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
590 // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100591 DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
592 DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
593 DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
594 DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
595 DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
596 DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
597 DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
598 DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
599 DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
600 DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
601 DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
602 DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100603
604 out6 += part0 - part1;
605 out12 += part0 + part1;
606 out7 += part2 + part3 + part4 + part5;
607 out8 += part2 - part3 + part4 - part5;
608 out13 += part2 + part3 - part4 - part5;
609 out14 += part2 - part3 - part4 + part5;
610 out9 += part6 + part7 + part8 + part9;
611 out10 += part6 - part7 + part8 - part9;
612 out15 += part6 - part7 - part8 + part9;
613 out16 += part6 + part7 - part8 - part9;
614 out11 += part10 + part11;
615 out17 += part10 - part11;
616
617 out18 += part13 - part12;
618 out24 += part13 + part12;
619 out19 += part14 + part15 + part16 + part17;
620 out20 += part14 - part15 + part16 - part17;
621 out25 += part14 - part15 - part16 + part17;
622 out26 += part14 + part15 - part16 - part17;
623 out21 += part18 + part19 + part20 + part21;
624 out22 += part18 - part19 + part20 - part21;
625 out27 += part18 - part19 - part20 + part21;
626 out28 += part18 + part19 - part20 - part21;
627 out23 += part22 + part23;
628 out29 += part22 - part23;
629
630 *(dst_addr) = out6;
631 dst_addr += dst_plane_stride;
632 *(dst_addr) = out7;
633 dst_addr += dst_plane_stride;
634 *(dst_addr) = out8;
635 dst_addr += dst_plane_stride;
636 *(dst_addr) = out9;
637 dst_addr += dst_plane_stride;
638 *(dst_addr) = out10;
639 dst_addr += dst_plane_stride;
640 *(dst_addr) = out11;
641 dst_addr += dst_plane_stride;
642 *(dst_addr) = out12;
643 dst_addr += dst_plane_stride;
644 *(dst_addr) = out13;
645 dst_addr += dst_plane_stride;
646 *(dst_addr) = out14;
647 dst_addr += dst_plane_stride;
648 *(dst_addr) = out15;
649 dst_addr += dst_plane_stride;
650 *(dst_addr) = out16;
651 dst_addr += dst_plane_stride;
652 *(dst_addr) = out17;
653 dst_addr += dst_plane_stride;
654
655 *(dst_addr) = out18;
656 dst_addr += dst_plane_stride;
657 *(dst_addr) = out19;
658 dst_addr += dst_plane_stride;
659 *(dst_addr) = out20;
660 dst_addr += dst_plane_stride;
661 *(dst_addr) = out21;
662 dst_addr += dst_plane_stride;
663 *(dst_addr) = out22;
664 dst_addr += dst_plane_stride;
665 *(dst_addr) = out23;
666 dst_addr += dst_plane_stride;
667 *(dst_addr) = out24;
668 dst_addr += dst_plane_stride;
669 *(dst_addr) = out25;
670 dst_addr += dst_plane_stride;
671 *(dst_addr) = out26;
672 dst_addr += dst_plane_stride;
673 *(dst_addr) = out27;
674 dst_addr += dst_plane_stride;
675 *(dst_addr) = out28;
676 dst_addr += dst_plane_stride;
677 *(dst_addr) = out29;
678 dst_addr += dst_plane_stride;
679
680 // Row5
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100681 VEC_DATA_TYPE(DATA_TYPE, 4)
682 d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
683 VEC_DATA_TYPE(DATA_TYPE, 2)
684 d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100685
686 // Channels [30, 35]
687 out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
688 out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
689 out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
690 out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
691 out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
692 out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
693
694 *(dst_addr) = out0;
695 dst_addr += dst_plane_stride;
696 *(dst_addr) = out1;
697 dst_addr += dst_plane_stride;
698 *(dst_addr) = out2;
699 dst_addr += dst_plane_stride;
700 *(dst_addr) = out3;
701 dst_addr += dst_plane_stride;
702 *(dst_addr) = out4;
703 dst_addr += dst_plane_stride;
704 *(dst_addr) = out5;
705 dst_addr += dst_plane_stride;
706#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
707}
708
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100709/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
710 *
711 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
712 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
713 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
714 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
715 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
716 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
717 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
718 *
719 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
720 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
721 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
722 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
723 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
724 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
725 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
726 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
727 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
728 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
729 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
730 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
731 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
732 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
733 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
734 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
735 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
736 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
737 */
738__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
739 TENSOR3D_DECLARATION(src),
740 TENSOR3D_DECLARATION(dst),
741 uint src_stride_w,
742 uint dst_stride_w)
743{
744 const int x = get_global_id(0);
745 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000746#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100747 const int z = get_global_id(2) % SRC_DEPTH;
748 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000749#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000750 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000751#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100752
753 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000754#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100755 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000756#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000757 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000758#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100759 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
760
761 // Load input tile
762#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
763 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
764#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
765 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
766 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
767 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
768 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
769 *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
770 *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
771 *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
772 *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
773#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
774 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
775 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
776 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
777 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
778 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
779 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
780 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
781 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
782#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
783
784 // Calculate common factors for intermediate tensor
785 VEC_DATA_TYPE(DATA_TYPE, 8)
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100786 tmp0 = in_row0;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100787 VEC_DATA_TYPE(DATA_TYPE, 8)
788 comm_fact0 = 0.0f;
789
790#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arena049989a2021-03-22 17:02:26 +0000791 comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100792 tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100793
794 VEC_DATA_TYPE(DATA_TYPE, 8)
Giorgio Arena049989a2021-03-22 17:02:26 +0000795 comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100796 VEC_DATA_TYPE(DATA_TYPE, 8)
Giorgio Arena049989a2021-03-22 17:02:26 +0000797 comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100798
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100799 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
800 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100801
Giorgio Arena049989a2021-03-22 17:02:26 +0000802 comm_fact0 = (DATA_TYPE)2.5f * in_row3;
803 comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100804
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100805 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
806 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100807
Giorgio Arena049989a2021-03-22 17:02:26 +0000808 comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
809 comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100810
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100811 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
812 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
813 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100814#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
815
816 // Calculate output rows (reuse comm_fact0 vector)
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100817 VEC_DATA_TYPE(DATA_TYPE, 8)
818 out0;
819
820 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100821
822#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100823 VEC_DATA_TYPE(DATA_TYPE, 8)
824 out1, out2, out3, out4, out5, out6, out7;
825
826 OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
827 OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
828 OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
829 OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
830 OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
831 OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
832 OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100833#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
834
835 // Store values across the channels
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000836#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100837 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000838#else /* defined(SRC_DEPTH) */
839 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
840#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100841
842 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
843 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
844 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
845 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
846 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
847 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
848 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
849 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
850
851#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
852 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
853 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
854 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
855 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
856 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
857 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
858 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
859 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
860 *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
861 *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
862 *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
863 *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
864 *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
865 *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
866 *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
867 *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
868 *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
869 *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
870 *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
871 *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
872 *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
873 *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
874 *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
875 *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
876 *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
877 *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
878 *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
879 *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
880 *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
881 *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
882 *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
883 *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
884 *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
885 *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
886 *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
887 *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
888 *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
889 *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
890 *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
891 *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
892 *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
893 *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
894 *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
895 *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
896 *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
897 *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
898 *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
899 *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
900 *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
901 *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
902 *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
903 *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
904 *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
905 *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
906 *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
907 *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
908#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
909}
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100910
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100911#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
912//! @cond Doxygen_Suppress
Giorgio Arena149fdf32018-07-04 17:03:33 +0100913/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100914 *
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100915 * @note Data layout supported: NHWC
916 * @note Data type supported: F32/F16
917 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
918 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
919 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
920 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100921 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
922 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
923 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
924 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100925 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100926 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100927 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
928 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
929 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
930 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
931 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
932 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100933 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
934 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
935 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100936 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
937 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
938 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
939 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
940 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
941 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100942 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100943 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100944 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
945 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100946 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +0100947//! @endcond
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100948__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100949 TENSOR4D(src, BUFFER),
950 TENSOR4D(dst, BUFFER))
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100951{
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100952 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
953 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
954 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100955
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100956 // All the tensor dimensions are passed at compile time.
957 // In case of dynamic tensor support, the following dimensions should be passed as function argument.
958#define _ISRC_WIDTH SRC_WIDTH
959#define _ISRC_HEIGHT SRC_HEIGHT
960#define _INUM_TILES_X NUM_TILES_X
961#define _INUM_TILES_Y NUM_TILES_Y
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100962
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100963 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
964 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
965 x -= PAD_LEFT;
966 y -= PAD_TOP;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100967
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100968#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100969
Gian Marco Iodice598e3a82021-04-13 15:53:20 +0100970 TILE(DATA_TYPE, 6, 1, in);
971 TILE(DATA_TYPE, 6, 1, out);
972
973 // Initialize the input tile
Giorgio Arenabdd16d12021-05-13 16:58:51 +0100974 LOOP_UNROLLING(int, i, 0, 1, 6,
Gian Marco Iodice598e3a82021-04-13 15:53:20 +0100975 {
976 in[i].v = 0;
Giorgio Arenabdd16d12021-05-13 16:58:51 +0100977 })
Giorgio Arena149fdf32018-07-04 17:03:33 +0100978
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100979#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
980 T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
981#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
982 T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
983#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100984
Gian Marco Iodice598e3a82021-04-13 15:53:20 +0100985 TILE(DATA_TYPE, 6, 1, com);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100986
Giorgio Arenabdd16d12021-05-13 16:58:51 +0100987 LOOP_UNROLLING(int, i, 0, 1, 6,
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100988 {
989 in[i].v *= 4.0f;
Giorgio Arenabdd16d12021-05-13 16:58:51 +0100990 })
Giorgio Arena149fdf32018-07-04 17:03:33 +0100991
Gian Marco Iodice534b8892021-04-01 16:17:16 +0100992 com[0].v = in[2].v - 4.f * in[0].v;
993 com[1].v = in[3].v - 4.f * in[1].v;
994 com[2].v = in[4].v - 4.f * in[2].v;
995 com[3].v = in[5].v - 4.f * in[3].v;
996 com[4].v = in[3].v - in[1].v;
997 com[4].v = com[4].v + com[4].v;
998 com[5].v = in[4].v - in[2].v;
Giorgio Arena049989a2021-03-22 17:02:26 +0000999
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001000 out[0].v = com[2].v - com[0].v;
1001 out[1].v = com[2].v + com[1].v;
1002 out[2].v = com[2].v - com[1].v;
1003 out[3].v = com[5].v + com[4].v;
1004 out[4].v = com[5].v - com[4].v;
1005 out[5].v = com[3].v - com[1].v;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001006
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001007 TILE(uint, 6, 1, dst_indirect_y);
Giorgio Arena049989a2021-03-22 17:02:26 +00001008
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001009 LOOP_UNROLLING(int, i, 0, 1, 6,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001010 {
1011 dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
1012 dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001013 })
Giorgio Arena049989a2021-03-22 17:02:26 +00001014
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001015 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Giorgio Arena049989a2021-03-22 17:02:26 +00001016
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001017#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001018
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001019 TILE(DATA_TYPE, 36, 1, in);
1020
1021 // Initialize the input tile
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001022 LOOP_UNROLLING(int, i, 0, 1, 36,
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001023 {
1024 in[i].v = 0;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001025 })
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001026
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001027 // Load the tile from a NHWC tensor
1028 T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
Giorgio Arena049989a2021-03-22 17:02:26 +00001029
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001030 TILE(DATA_TYPE, 6, 1, com);
1031 TILE(DATA_TYPE, 36, 1, tmp);
Giorgio Arena049989a2021-03-22 17:02:26 +00001032
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001033 LOOP_UNROLLING(int, i, 0, 1, 6,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001034 {
1035 com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
1036 com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
1037 com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
1038 com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
1039 com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
1040 com[4].v = com[4].v + com[4].v;
1041 com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
1042 tmp[i + 0 * 6].v = com[2].v - com[0].v;
1043 tmp[i + 1 * 6].v = com[2].v + com[1].v;
1044 tmp[i + 2 * 6].v = com[2].v - com[1].v;
1045 tmp[i + 3 * 6].v = com[5].v + com[4].v;
1046 tmp[i + 4 * 6].v = com[5].v - com[4].v;
1047 tmp[i + 5 * 6].v = com[3].v - com[1].v;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001048 })
Giorgio Arena049989a2021-03-22 17:02:26 +00001049
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001050 TILE(DATA_TYPE, 36, 1, out);
Giorgio Arena049989a2021-03-22 17:02:26 +00001051
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001052 LOOP_UNROLLING(int, i, 0, 1, 6,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001053 {
1054 com[0].v = tmp[i * 6 + 2].v - 4.f * tmp[i * 6 + 0].v;
1055 com[1].v = tmp[i * 6 + 3].v - 4.f * tmp[i * 6 + 1].v;
1056 com[2].v = tmp[i * 6 + 4].v - 4.f * tmp[i * 6 + 2].v;
1057 com[3].v = tmp[i * 6 + 5].v - 4.f * tmp[i * 6 + 3].v;
1058 com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
1059 com[4].v = com[4].v + com[4].v;
1060 com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
1061 out[i * 6 + 0].v = com[2].v - com[0].v;
1062 out[i * 6 + 1].v = com[2].v + com[1].v;
1063 out[i * 6 + 2].v = com[2].v - com[1].v;
1064 out[i * 6 + 3].v = com[5].v + com[4].v;
1065 out[i * 6 + 4].v = com[5].v - com[4].v;
1066 out[i * 6 + 5].v = com[3].v - com[1].v;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001067 })
Giorgio Arena149fdf32018-07-04 17:03:33 +01001068
1069 // Compute destination address
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001070 TILE(uint, 36, 1, dst_indirect_y);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001071
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001072 LOOP_UNROLLING(int, i, 0, 1, 36,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001073 {
1074 dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
1075 dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001076 })
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001077
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001078 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
1079#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001080}
1081
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001082//! @cond Doxygen_Suppress
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001083/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
Giorgio Arena149fdf32018-07-04 17:03:33 +01001084 *
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001085 * @note Data layout supported: NHWC
1086 * @note Data type supported: F32/F16
1087 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
1088 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
1089 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
1090 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001091 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
Giorgio Arena149fdf32018-07-04 17:03:33 +01001092 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001093 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1094 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Giorgio Arena149fdf32018-07-04 17:03:33 +01001095 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001096 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arena149fdf32018-07-04 17:03:33 +01001097 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1098 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1099 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1100 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1101 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1102 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001103 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1104 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1105 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001106 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1107 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1108 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1109 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1110 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1111 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001112 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001113 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001114 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1115 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Giorgio Arena149fdf32018-07-04 17:03:33 +01001116 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001117//! @endcond
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001118__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001119 TENSOR4D(src, BUFFER),
1120 TENSOR4D(dst, BUFFER))
Giorgio Arena149fdf32018-07-04 17:03:33 +01001121{
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001122 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
1123 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
1124 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001125
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001126 // All the tensor dimensions are passed at compile time.
1127 // In case of dynamic tensor support, the following dimensions should be passed as function argument.
1128#define _ISRC_WIDTH SRC_WIDTH
1129#define _ISRC_HEIGHT SRC_HEIGHT
1130#define _INUM_TILES_X NUM_TILES_X
1131#define _INUM_TILES_Y NUM_TILES_Y
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001132
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001133 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
1134 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
1135 x -= PAD_LEFT;
1136 y -= PAD_TOP;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001137
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001138#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001139
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001140 TILE(DATA_TYPE, 8, 1, in);
1141 TILE(DATA_TYPE, 8, 1, out);
1142
1143 // Initialize the input tile
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001144 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001145 {
1146 in[i].v = 0;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001147 })
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001148
1149#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001150 T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
1151#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1152 T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
1153#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001154
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001155 TILE(DATA_TYPE, 1, 8, com);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001156
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001157 com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
1158 com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
1159 com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
1160 com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
1161 com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
1162 com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
1163 out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
1164 out[1].s[0] = com[0].s[0] + com[0].s[1];
1165 out[2].s[0] = com[0].s[0] - com[0].s[1];
1166 out[3].s[0] = com[0].s[3] + com[0].s[2];
1167 out[4].s[0] = com[0].s[3] - com[0].s[2];
1168 out[5].s[0] = com[0].s[4] + com[0].s[5];
1169 out[6].s[0] = com[0].s[4] - com[0].s[5];
1170 out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001171
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001172 TILE(uint, 8, 1, dst_indirect_y);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001173
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001174 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001175 {
1176 dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
1177 dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001178 })
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001179
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001180 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001181
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001182#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001183
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001184 TILE(DATA_TYPE, 64, 1, in);
1185 TILE(DATA_TYPE, 64, 1, out);
1186
1187 // Initialize the input tile
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001188 LOOP_UNROLLING(int, i, 0, 1, 64,
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001189 {
1190 in[i].v = 0;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001191 })
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001192
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001193 // Load the tile from a NHWC tensor
1194 T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001195
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001196 TILE(DATA_TYPE, 8, 8, com);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001197
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001198 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001199 {
1200 com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
1201 com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
1202 com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
1203 com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
1204 com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
1205 com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
1206 com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
1207 com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001208 })
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001209
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001210 TILE(DATA_TYPE, 8, 8, tmp);
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001211 tmp[0].v = com[6].v;
1212 tmp[1].v = com[0].v + com[1].v;
1213 tmp[2].v = com[0].v - com[1].v;
1214 tmp[3].v = com[2].v + com[3].v;
1215 tmp[4].v = com[2].v - com[3].v;
1216 tmp[5].v = com[4].v + com[5].v;
1217 tmp[6].v = com[4].v - com[5].v;
1218 tmp[7].v = com[7].v;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001219
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001220 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001221 {
1222 com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
1223 com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
1224 com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
1225 com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
1226 com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
1227 com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
1228 out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
1229 out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
1230 out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
1231 out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
1232 out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
1233 out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
1234 out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
1235 out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001236 })
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001237
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001238 TILE(uint, 64, 1, dst_indirect_y);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001239
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001240 LOOP_UNROLLING(int, i, 0, 1, 64,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001241 {
1242 dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
1243 dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001244 })
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001245
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001246 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001247
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001248#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1249}
1250
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001251//! @cond Doxygen_Suppress
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001252/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
1253 *
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001254 * @note Data layout supported: NHWC
1255 * @note Data type supported: F32/F16
1256 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
1257 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
1258 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
1259 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
1260 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1261 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1262 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1263 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001264 *
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001265 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001266 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1267 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1268 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1269 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1270 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1271 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001272 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1273 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1274 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001275 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1276 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1277 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1278 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1279 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1280 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001281 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001282 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001283 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1284 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001285 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001286//! @endcond
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001287__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001288 TENSOR4D(src, BUFFER),
1289 TENSOR4D(dst, BUFFER))
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001290{
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001291 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
1292 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
1293 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001294
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001295 // All the tensor dimensions are passed at compile time.
1296 // In case of dynamic tensor support, the following dimensions should be passed as function argument.
1297#define _ISRC_WIDTH SRC_WIDTH
1298#define _ISRC_HEIGHT SRC_HEIGHT
1299#define _INUM_TILES_X NUM_TILES_X
1300#define _INUM_TILES_Y NUM_TILES_Y
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001301
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001302 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
1303 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
1304 x -= PAD_LEFT;
1305 y -= PAD_TOP;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001306
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001307#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001308
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001309 TILE(DATA_TYPE, 8, 1, in);
1310 TILE(DATA_TYPE, 8, 1, out);
1311
1312 // Initialize the input tile
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001313 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001314 {
1315 in[i].v = 0;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001316 })
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001317
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001318#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001319 T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
1320#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1321 T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
1322#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001323
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001324 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001325 {
1326 in[i].v *= (DATA_TYPE) - 36.0f;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001327 })
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001328
Sheri Zhang6dbcc0e2021-04-12 10:53:57 +01001329 TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001330
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001331 com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
1332 com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
1333 com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
1334 com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
1335 com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
1336 com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
1337 out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
1338 out[1].s[0] = com[0].s[0] - com[0].s[1];
1339 out[2].s[0] = com[0].s[0] + com[0].s[1];
1340 out[3].s[0] = com[0].s[2] - com[0].s[3];
1341 out[4].s[0] = com[0].s[2] + com[0].s[3];
1342 out[5].s[0] = com[0].s[4] - com[0].s[5];
1343 out[6].s[0] = com[0].s[4] + com[0].s[5];
1344 out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001345
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001346 TILE(uint, 8, 1, dst_indirect_y);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001347
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001348 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001349 {
1350 dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
1351 dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001352 })
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001353
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001354 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001355
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001356#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001357
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001358 TILE(DATA_TYPE, 64, 1, in);
1359 TILE(DATA_TYPE, 64, 1, out);
1360
1361 // Initialize the input tile
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001362 LOOP_UNROLLING(int, i, 0, 1, 64,
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001363 {
1364 in[i].v = 0;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001365 })
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001366
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001367 // Load the tile from a NHWC tensor
1368 T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001369
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001370 TILE(DATA_TYPE, 8, 8, com);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001371
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001372 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001373 {
1374 com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
1375 com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
1376 com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
1377 com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
1378 com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
1379 com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
1380 com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
1381 com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001382 })
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001383
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001384 TILE(DATA_TYPE, 8, 8, tmp);
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001385 tmp[0].v = com[6].v;
1386 tmp[1].v = com[0].v - com[1].v;
1387 tmp[2].v = com[0].v + com[1].v;
1388 tmp[3].v = com[2].v - com[3].v;
1389 tmp[4].v = com[2].v + com[3].v;
1390 tmp[5].v = com[4].v - com[5].v;
1391 tmp[6].v = com[4].v + com[5].v;
1392 tmp[7].v = com[7].v;
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001393
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001394 LOOP_UNROLLING(int, i, 0, 1, 8,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001395 {
1396 com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
1397 com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
1398 com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
1399 com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
1400 com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
1401 com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
1402 out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
1403 out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
1404 out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
1405 out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
1406 out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
1407 out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
1408 out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
1409 out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001410 })
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001411
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001412 TILE(uint, 64, 1, dst_indirect_y);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001413
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001414 LOOP_UNROLLING(int, i, 0, 1, 64,
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001415 {
1416 dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
1417 dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
Giorgio Arenabdd16d12021-05-13 16:58:51 +01001418 })
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001419
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001420 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001421
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001422#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001423}
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001424
1425//! @cond Doxygen_Suppress
1426/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
1427 *
1428 * @note Data layout supported: NHWC
1429 * @note Data type supported: F32/F16
1430 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
1431 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
1432 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
1433 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
1434 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1435 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1436 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1437 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
1438 *
1439 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
1440 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1441 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1442 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1443 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1444 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1445 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1446 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1447 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1448 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1449 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1450 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1451 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1452 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1453 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1454 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1455 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1456 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1457 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1458 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1459 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001460//! @endcond
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001461__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
1462 TENSOR4D(src, BUFFER),
1463 TENSOR4D(dst, BUFFER))
1464{
1465 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
1466 src_stride_x,
1467 src_step_x,
1468 src_stride_y,
1469 src_step_y,
1470 src_stride_z,
1471 src_step_z,
1472 src_stride_w,
1473 src_step_w,
1474 src_offset_first_element_in_bytes,
1475 dst_ptr,
1476 dst_stride_x,
1477 dst_step_x,
1478 dst_stride_y,
1479 dst_step_y,
1480 dst_stride_z,
1481 dst_step_z,
1482 dst_stride_w,
1483 dst_step_w,
1484 dst_offset_first_element_in_bytes);
1485}
1486
1487//! @cond Doxygen_Suppress
1488/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
1489 *
1490 * @note Data layout supported: NHWC
1491 * @note Data type supported: F32/F16
1492 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
1493 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
1494 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
1495 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
1496 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1497 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1498 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1499 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
1500 *
1501 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
1502 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1503 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1504 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1505 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1506 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1507 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1508 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1509 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1510 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1511 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1512 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1513 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1514 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1515 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1516 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1517 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1518 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1519 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1520 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1521 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001522//! @endcond
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001523__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
1524 TENSOR4D(src, BUFFER),
1525 TENSOR4D(dst, BUFFER))
1526{
1527 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
1528 src_stride_x,
1529 src_step_x,
1530 src_stride_y,
1531 src_step_y,
1532 src_stride_z,
1533 src_step_z,
1534 src_stride_w,
1535 src_step_w,
1536 src_offset_first_element_in_bytes,
1537 dst_ptr,
1538 dst_stride_x,
1539 dst_step_x,
1540 dst_stride_y,
1541 dst_step_y,
1542 dst_stride_z,
1543 dst_step_z,
1544 dst_stride_w,
1545 dst_step_w,
1546 dst_offset_first_element_in_bytes);
1547}
1548
1549//! @cond Doxygen_Suppress
1550/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
1551 *
1552 * @note Data layout supported: NHWC
1553 * @note Data type supported: F32/F16
1554 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
1555 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
1556 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
1557 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
1558 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1559 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1560 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1561 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
1562 *
1563 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
1564 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1565 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1566 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1567 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1568 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1569 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1570 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1571 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1572 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1573 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1574 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1575 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1576 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1577 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1578 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1579 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1580 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1581 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1582 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1583 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001584//! @endcond
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001585__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
1586 TENSOR4D(src, BUFFER),
1587 TENSOR4D(dst, BUFFER))
1588{
1589 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
1590 src_stride_x,
1591 src_step_x,
1592 src_stride_y,
1593 src_step_y,
1594 src_stride_z,
1595 src_step_z,
1596 src_stride_w,
1597 src_step_w,
1598 src_offset_first_element_in_bytes,
1599 dst_ptr,
1600 dst_stride_x,
1601 dst_step_x,
1602 dst_stride_y,
1603 dst_step_y,
1604 dst_stride_z,
1605 dst_step_z,
1606 dst_stride_w,
1607 dst_step_w,
1608 dst_offset_first_element_in_bytes);
1609}
1610
1611//! @cond Doxygen_Suppress
1612/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
1613 *
1614 * @note Data layout supported: NHWC
1615 * @note Data type supported: F32/F16
1616 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
1617 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
1618 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
1619 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
1620 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1621 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1622 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1623 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
1624 *
1625 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
1626 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1627 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1628 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1629 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1630 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1631 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1632 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1633 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1634 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1635 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1636 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1637 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1638 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1639 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1640 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1641 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1642 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1643 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1644 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1645 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001646//! @endcond
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001647__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
1648 TENSOR4D(src, BUFFER),
1649 TENSOR4D(dst, BUFFER))
1650{
1651 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
1652 src_stride_x,
1653 src_step_x,
1654 src_stride_y,
1655 src_step_y,
1656 src_stride_z,
1657 src_step_z,
1658 src_stride_w,
1659 src_step_w,
1660 src_offset_first_element_in_bytes,
1661 dst_ptr,
1662 dst_stride_x,
1663 dst_step_x,
1664 dst_stride_y,
1665 dst_step_y,
1666 dst_stride_z,
1667 dst_step_z,
1668 dst_stride_w,
1669 dst_step_w,
1670 dst_offset_first_element_in_bytes);
1671}
1672
1673//! @cond Doxygen_Suppress
1674/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
1675 *
1676 * @note Data layout supported: NHWC
1677 * @note Data type supported: F32/F16
1678 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
1679 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
1680 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
1681 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
1682 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1683 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1684 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1685 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
1686 *
1687 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
1688 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1689 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1690 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1691 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1692 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1693 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1694 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1695 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1696 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1697 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1698 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1699 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1700 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1701 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1702 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1703 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1704 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1705 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1706 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1707 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001708//! @endcond
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001709__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
1710 TENSOR4D(src, BUFFER),
1711 TENSOR4D(dst, BUFFER))
1712{
1713 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
1714 src_stride_x,
1715 src_step_x,
1716 src_stride_y,
1717 src_step_y,
1718 src_stride_z,
1719 src_step_z,
1720 src_stride_w,
1721 src_step_w,
1722 src_offset_first_element_in_bytes,
1723 dst_ptr,
1724 dst_stride_x,
1725 dst_step_x,
1726 dst_stride_y,
1727 dst_step_y,
1728 dst_stride_z,
1729 dst_step_z,
1730 dst_stride_w,
1731 dst_step_w,
1732 dst_offset_first_element_in_bytes);
1733}
1734
1735//! @cond Doxygen_Suppress
1736/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
1737 *
1738 * @note Data layout supported: NHWC
1739 * @note Data type supported: F32/F16
1740 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
1741 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
1742 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
1743 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
1744 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1745 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1746 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1747 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
1748 *
1749 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
1750 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1751 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1752 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1753 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1754 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1755 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1756 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1757 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1758 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1759 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1760 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1761 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1762 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1763 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1764 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1765 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1766 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1767 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1768 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1769 */
Gian Marco Iodice598e3a82021-04-13 15:53:20 +01001770//! @endcond
Gian Marco Iodice534b8892021-04-01 16:17:16 +01001771__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
1772 TENSOR4D(src, BUFFER),
1773 TENSOR4D(dst, BUFFER))
1774{
1775 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
1776 src_stride_x,
1777 src_step_x,
1778 src_stride_y,
1779 src_step_y,
1780 src_stride_z,
1781 src_step_z,
1782 src_stride_w,
1783 src_step_w,
1784 src_offset_first_element_in_bytes,
1785 dst_ptr,
1786 dst_stride_x,
1787 dst_step_x,
1788 dst_stride_y,
1789 dst_step_y,
1790 dst_stride_z,
1791 dst_step_z,
1792 dst_stride_w,
1793 dst_step_w,
1794 dst_offset_first_element_in_bytes);
1795}
1796#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001797
1798#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1799/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
1800 *
1801 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1802 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1803 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1804 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1805 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001806 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001807 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001808 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001809 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1810 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1811 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1812 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1813 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1814 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1815 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1816 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1817 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1818 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1819 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1820 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1821 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1822 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1823 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001824 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1825 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001826 */
1827__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
1828 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001829 TENSOR3D_DECLARATION(dst),
1830 uint src_stride_w,
1831 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001832{
1833 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
1834 src_stride_x,
1835 src_step_x,
1836 src_stride_y,
1837 src_step_y,
1838 src_stride_z,
1839 src_step_z,
1840 src_offset_first_element_in_bytes,
1841 dst_ptr,
1842 dst_stride_x,
1843 dst_step_x,
1844 dst_stride_y,
1845 dst_step_y,
1846 dst_stride_z,
1847 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001848 dst_offset_first_element_in_bytes,
1849 src_stride_w,
1850 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001851}
1852
1853/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2
1854 *
1855 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1856 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1857 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1858 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1859 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001860 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001861 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001862 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001863 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1864 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1865 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1866 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1867 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1868 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1869 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1870 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1871 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1872 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1873 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1874 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1875 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1876 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1877 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001878 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1879 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001880 */
1881__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
1882 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001883 TENSOR3D_DECLARATION(dst),
1884 uint src_stride_w,
1885 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001886{
1887 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
1888 src_stride_x,
1889 src_step_x,
1890 src_stride_y,
1891 src_step_y,
1892 src_stride_z,
1893 src_step_z,
1894 src_offset_first_element_in_bytes,
1895 dst_ptr,
1896 dst_stride_x,
1897 dst_step_x,
1898 dst_stride_y,
1899 dst_step_y,
1900 dst_stride_z,
1901 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001902 dst_offset_first_element_in_bytes,
1903 src_stride_w,
1904 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001905}
1906
1907/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
1908 *
1909 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1910 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1911 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
1912 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1913 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001914 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001915 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001916 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001917 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1918 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1919 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1920 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1921 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1922 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1923 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1924 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1925 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1926 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1927 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1928 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1929 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1930 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1931 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001932 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1933 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001934 */
1935__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
1936 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001937 TENSOR3D_DECLARATION(dst),
1938 uint src_stride_w,
1939 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001940{
1941 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
1942 src_stride_x,
1943 src_step_x,
1944 src_stride_y,
1945 src_step_y,
1946 src_stride_z,
1947 src_step_z,
1948 src_offset_first_element_in_bytes,
1949 dst_ptr,
1950 dst_stride_x,
1951 dst_step_x,
1952 dst_stride_y,
1953 dst_step_y,
1954 dst_stride_z,
1955 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001956 dst_offset_first_element_in_bytes,
1957 src_stride_w,
1958 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001959}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001960
1961/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
1962 *
1963 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1964 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1965 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1966 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1967 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001968 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001969 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001970 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001971 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1972 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1973 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1974 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1975 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1976 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1977 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1978 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1979 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1980 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1981 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1982 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1983 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1984 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1985 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001986 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1987 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001988 */
1989__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
1990 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001991 TENSOR3D_DECLARATION(dst),
1992 uint src_stride_w,
1993 uint dst_stride_w)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01001994{
1995 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
1996 src_stride_x,
1997 src_step_x,
1998 src_stride_y,
1999 src_step_y,
2000 src_stride_z,
2001 src_step_z,
2002 src_offset_first_element_in_bytes,
2003 dst_ptr,
2004 dst_stride_x,
2005 dst_step_x,
2006 dst_stride_y,
2007 dst_step_y,
2008 dst_stride_z,
2009 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002010 dst_offset_first_element_in_bytes,
2011 src_stride_w,
2012 dst_stride_w);
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002013}
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002014#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
2015
2016#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
2017/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
2018 *
2019 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2020 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2021 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2022 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
2023 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002024 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002025 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002026 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002027 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2028 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2029 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2030 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2031 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2032 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2033 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2034 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2035 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2036 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2037 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2038 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2039 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2040 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2041 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002042 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2043 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002044 */
2045__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
2046 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002047 TENSOR3D_DECLARATION(dst),
2048 uint src_stride_w,
2049 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002050{
2051 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
2052 src_stride_x,
2053 src_step_x,
2054 src_stride_y,
2055 src_step_y,
2056 src_stride_z,
2057 src_step_z,
2058 src_offset_first_element_in_bytes,
2059 dst_ptr,
2060 dst_stride_x,
2061 dst_step_x,
2062 dst_stride_y,
2063 dst_step_y,
2064 dst_stride_z,
2065 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002066 dst_offset_first_element_in_bytes,
2067 src_stride_w,
2068 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002069}
2070
2071/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2
2072 *
2073 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2074 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2075 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2076 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
2077 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002078 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002079 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002080 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002081 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2082 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2083 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2084 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2085 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2086 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2087 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2088 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2089 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2090 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2091 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2092 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2093 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2094 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2095 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002096 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2097 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002098 */
2099__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
2100 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002101 TENSOR3D_DECLARATION(dst),
2102 uint src_stride_w,
2103 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002104{
2105 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
2106 src_stride_x,
2107 src_step_x,
2108 src_stride_y,
2109 src_step_y,
2110 src_stride_z,
2111 src_step_z,
2112 src_offset_first_element_in_bytes,
2113 dst_ptr,
2114 dst_stride_x,
2115 dst_step_x,
2116 dst_stride_y,
2117 dst_step_y,
2118 dst_stride_z,
2119 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002120 dst_offset_first_element_in_bytes,
2121 src_stride_w,
2122 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002123}
2124
2125/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
2126 *
2127 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2128 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2129 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2130 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2131 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002132 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002133 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002134 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002135 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2136 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2137 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2138 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2139 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2140 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2141 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2142 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2143 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2144 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2145 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2146 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2147 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2148 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2149 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002150 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2151 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002152 */
2153__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
2154 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002155 TENSOR3D_DECLARATION(dst),
2156 uint src_stride_w,
2157 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002158{
2159 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
2160 src_stride_x,
2161 src_step_x,
2162 src_stride_y,
2163 src_step_y,
2164 src_stride_z,
2165 src_step_z,
2166 src_offset_first_element_in_bytes,
2167 dst_ptr,
2168 dst_stride_x,
2169 dst_step_x,
2170 dst_stride_y,
2171 dst_step_y,
2172 dst_stride_z,
2173 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002174 dst_offset_first_element_in_bytes,
2175 src_stride_w,
2176 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002177}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002178
2179/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
2180 *
2181 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2182 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2183 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2184 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2185 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002186 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002187 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002188 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002189 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2190 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2191 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2192 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2193 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2194 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2195 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2196 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2197 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2198 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2199 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2200 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2201 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2202 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2203 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002204 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2205 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002206 */
2207__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
2208 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002209 TENSOR3D_DECLARATION(dst),
2210 uint src_stride_w,
2211 uint dst_stride_w)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002212{
2213 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
2214 src_stride_x,
2215 src_step_x,
2216 src_stride_y,
2217 src_step_y,
2218 src_stride_z,
2219 src_step_z,
2220 src_offset_first_element_in_bytes,
2221 dst_ptr,
2222 dst_stride_x,
2223 dst_step_x,
2224 dst_stride_y,
2225 dst_step_y,
2226 dst_stride_z,
2227 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002228 dst_offset_first_element_in_bytes,
2229 src_stride_w,
2230 dst_stride_w);
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002231}
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002232#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Michele Di Giorgiof955d512019-02-27 14:26:51 +00002233#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)