blob: 8a27a7ecada51f673f06308080234fd9034bc918 [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
Giorgio Arena049989a2021-03-22 17:02:26 +00002 * Copyright (c) 2018-2021 Arm Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Giorgio Arena2d1a8352020-10-26 15:04:08 +000026#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(datatype, basename, y_cond, z_cond) \
27 ({ \
28 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \
29 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \
30 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \
31 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \
32 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s0) && (z_cond))); \
33 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s1) && (z_cond))); \
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +010034 })
35
Giorgio Arena2d1a8352020-10-26 15:04:08 +000036#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(datatype, basename, y_cond, z_cond) \
37 ({ \
38 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \
39 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \
40 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \
41 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \
42 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s0))); \
43 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s1))); \
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +010044 })
45
Giorgio Arena2d1a8352020-10-26 15:04:08 +000046#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(datatype, basename, y_cond, z_cond) \
47 ({ \
48 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \
49 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \
50 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \
51 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \
52 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s4) && (z_cond))); \
53 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s5) && (z_cond))); \
54 basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s6) && (z_cond))); \
55 basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s7) && (z_cond))); \
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +010056 })
57
Giorgio Arena2d1a8352020-10-26 15:04:08 +000058#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(datatype, basename, y_cond, z_cond) \
59 ({ \
60 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \
61 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \
62 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \
63 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \
64 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s4))); \
65 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s5))); \
66 basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s6))); \
67 basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s7))); \
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +010068 })
69
Gian Marco Iodiced28b7512018-07-06 12:59:28 +010070#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
71 ({ \
72 comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
73 comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
74 comm_fact.s2 = 2.5f * tmp.s3; \
75 comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
76 comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
77 comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
78 comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
79 \
80 out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
81 out.s1 = comm_fact.s0 + comm_fact.s1; \
82 out.s2 = comm_fact.s0 - comm_fact.s1; \
83 out.s3 = comm_fact.s3 + comm_fact.s4; \
84 out.s4 = comm_fact.s4 - comm_fact.s3; \
85 out.s5 = comm_fact.s5 + comm_fact.s6; \
86 out.s6 = comm_fact.s5 - comm_fact.s6; \
87 out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
88 })
89
Michele Di Giorgiof955d512019-02-27 14:26:51 +000090#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
91 ({ \
92 comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
93 comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
94 comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
95 comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
96 comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
97 comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
98 out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
99 out.s1 = comm_fact.s0 - comm_fact.s1; \
100 out.s2 = comm_fact.s0 + comm_fact.s1; \
101 out.s3 = comm_fact.s2 - comm_fact.s3; \
102 out.s4 = comm_fact.s2 + comm_fact.s3; \
103 out.s5 = comm_fact.s4 - comm_fact.s5; \
104 out.s6 = comm_fact.s4 + comm_fact.s5; \
105 out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
106 })
107
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100108#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
109/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
110 *
111 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
112 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
113 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
114 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
115 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
116 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100117 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100118 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100119 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100120 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
121 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
122 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
123 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
124 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
125 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
126 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
127 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
128 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
129 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
130 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
131 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
132 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
133 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
134 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100135 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
136 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100137 */
138__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
139 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100140 TENSOR3D_DECLARATION(dst),
141 uint src_stride_w,
142 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100143{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100144 const int x = get_global_id(0);
145 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000146#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100147 const int z = get_global_id(2) % SRC_DEPTH;
148 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000149#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000150 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000151#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100152
153 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000154#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100155 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000156#else /* defined(SRC_DEPTH) */
157 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
158#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100159
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100160 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100161
162#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100163 VEC_DATA_TYPE(DATA_TYPE, 4)
164 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100165#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100166 VEC_DATA_TYPE(DATA_TYPE, 4)
167 in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
168 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
169 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
170 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100171#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100172 VEC_DATA_TYPE(DATA_TYPE, 4)
173 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
174 VEC_DATA_TYPE(DATA_TYPE, 4)
175 in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
176 VEC_DATA_TYPE(DATA_TYPE, 4)
177 in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
178 VEC_DATA_TYPE(DATA_TYPE, 4)
179 in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100180#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
181
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100182 VEC_DATA_TYPE(DATA_TYPE, 4)
183 tmp0 = in_row0;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100184
185#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
186 tmp0 -= in_row2;
187#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
188
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100189 DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
190 DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
191 DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
192 DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100193
194#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100195 VEC_DATA_TYPE(DATA_TYPE, 4)
196 tmp1 = in_row1 + in_row2;
197 VEC_DATA_TYPE(DATA_TYPE, 4)
198 tmp2 = in_row2 - in_row1;
199 VEC_DATA_TYPE(DATA_TYPE, 4)
200 tmp3 = in_row1 - in_row3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100201
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100202 DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
203 DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
204 DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
205 DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100206
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100207 DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
208 DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
209 DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
210 DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100211
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100212 DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
213 DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
214 DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
215 DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100216#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
217
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000218#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100219 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000220#else /* defined(SRC_DEPTH) */
221 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
222#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100223
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100224 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
225 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
226 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
227 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100228
229#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100230 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
231 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
232 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
233 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
234 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
235 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
236 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
237 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
238 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
239 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
240 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
241 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100242#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
243}
244
245/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2
246 *
247 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
248 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
249 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
250 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
251 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
252 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100253 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100254 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100255 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100256 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
257 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
258 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
259 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
260 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
261 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
262 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
263 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
264 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
265 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
266 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
267 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
268 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
269 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
270 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100271 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
272 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100273 */
274__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
275 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100276 TENSOR3D_DECLARATION(dst),
277 uint src_stride_w,
278 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100279{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100280 const int x = get_global_id(0);
281 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000282#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100283 const int z = (get_global_id(2) * 2) % SRC_DEPTH;
284 const int b = (get_global_id(2) * 2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000285#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000286 const int z = get_global_id(2) * 2;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000287#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100288
289 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000290#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100291 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000292#else /* defined(SRC_DEPTH) */
293 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
294#endif /* defined(SRC_DEPTH) */
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100295 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100296
297#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100298 VEC_DATA_TYPE(DATA_TYPE, 4)
299 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100300#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100301 VEC_DATA_TYPE(DATA_TYPE, 4)
302 in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
303 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
304 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
305 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100306#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100307 VEC_DATA_TYPE(DATA_TYPE, 4)
308 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
309 VEC_DATA_TYPE(DATA_TYPE, 4)
310 in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
311 VEC_DATA_TYPE(DATA_TYPE, 4)
312 in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
313 VEC_DATA_TYPE(DATA_TYPE, 4)
314 in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100315#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
316
317 src_addr += src_stride_z;
318#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100319 VEC_DATA_TYPE(DATA_TYPE, 4)
320 in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100321#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100322 VEC_DATA_TYPE(DATA_TYPE, 4)
323 in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
324 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
325 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
326 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100327#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100328 VEC_DATA_TYPE(DATA_TYPE, 4)
329 in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
330 VEC_DATA_TYPE(DATA_TYPE, 4)
331 in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
332 VEC_DATA_TYPE(DATA_TYPE, 4)
333 in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
334 VEC_DATA_TYPE(DATA_TYPE, 4)
335 in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100336#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
337
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100338 VEC_DATA_TYPE(DATA_TYPE, 4)
339 tmp0 = in_row0;
340 VEC_DATA_TYPE(DATA_TYPE, 4)
341 tmp4 = in_row4;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100342
343#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
344 tmp0 -= in_row2;
345 tmp4 -= in_row6;
346#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
347
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100348 VEC_DATA_TYPE(DATA_TYPE, 2)
349 out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
350 VEC_DATA_TYPE(DATA_TYPE, 2)
351 out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
352 VEC_DATA_TYPE(DATA_TYPE, 2)
353 out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
354 VEC_DATA_TYPE(DATA_TYPE, 2)
355 out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100356
357#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100358 VEC_DATA_TYPE(DATA_TYPE, 4)
359 tmp1 = in_row1 + in_row2;
360 VEC_DATA_TYPE(DATA_TYPE, 4)
361 tmp2 = in_row2 - in_row1;
362 VEC_DATA_TYPE(DATA_TYPE, 4)
363 tmp3 = in_row1 - in_row3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100364
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100365 VEC_DATA_TYPE(DATA_TYPE, 4)
366 tmp5 = in_row5 + in_row6;
367 VEC_DATA_TYPE(DATA_TYPE, 4)
368 tmp6 = in_row6 - in_row5;
369 VEC_DATA_TYPE(DATA_TYPE, 4)
370 tmp7 = in_row5 - in_row7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100371
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100372 VEC_DATA_TYPE(DATA_TYPE, 2)
373 out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
374 VEC_DATA_TYPE(DATA_TYPE, 2)
375 out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
376 VEC_DATA_TYPE(DATA_TYPE, 2)
377 out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
378 VEC_DATA_TYPE(DATA_TYPE, 2)
379 out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100380
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100381 VEC_DATA_TYPE(DATA_TYPE, 2)
382 out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
383 VEC_DATA_TYPE(DATA_TYPE, 2)
384 out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
385 VEC_DATA_TYPE(DATA_TYPE, 2)
386 out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
387 VEC_DATA_TYPE(DATA_TYPE, 2)
388 out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100389
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100390 VEC_DATA_TYPE(DATA_TYPE, 2)
391 out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
392 VEC_DATA_TYPE(DATA_TYPE, 2)
393 out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
394 VEC_DATA_TYPE(DATA_TYPE, 2)
395 out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
396 VEC_DATA_TYPE(DATA_TYPE, 2)
397 out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100398#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
399
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000400#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100401 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000402#else /* defined(SRC_DEPTH) */
403 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
404#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100405
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100406 vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
407 vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
408 vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
409 vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100410
411#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100412 vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
413 vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
414 vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
415 vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
416 vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
417 vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
418 vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
419 vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
420 vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
421 vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
422 vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
423 vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100424#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
425}
426
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100427/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100428 *
429 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
430 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
431 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
432 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
433 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
434 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100435 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100436 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100437 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100438 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
439 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
440 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
441 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
442 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
443 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
444 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
445 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
446 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
447 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
448 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
449 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
450 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
451 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
452 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100453 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
454 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100455 */
456__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
457 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100458 TENSOR3D_DECLARATION(dst),
459 uint src_stride_w,
460 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100461{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100462 const int x = get_global_id(0);
463 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000464#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100465 const int z = get_global_id(2) % SRC_DEPTH;
466 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000467#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000468 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000469#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100470
471 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000472#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100473 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000474#else /* defined(SRC_DEPTH) */
475 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
476#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100477
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100478 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100479
480#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
481 // Row0
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100482 VEC_DATA_TYPE(DATA_TYPE, 4)
483 d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
484 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
485 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
486 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
487 VEC_DATA_TYPE(DATA_TYPE, 2)
488 d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
489 *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100490#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
491 // Row0
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100492 VEC_DATA_TYPE(DATA_TYPE, 4)
493 d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
494 VEC_DATA_TYPE(DATA_TYPE, 2)
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000495 d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100496#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
497
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100498 DATA_TYPE out0 = 0.0f;
499 DATA_TYPE out1 = 0.0f;
500 DATA_TYPE out2 = 0.0f;
501 DATA_TYPE out3 = 0.0f;
502 DATA_TYPE out4 = 0.0f;
503 DATA_TYPE out5 = 0.0f;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100504
505 // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
506 out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
507 out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
508 out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
509 out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
510 out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
511 out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
512
513#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
514 // Row4
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100515 VEC_DATA_TYPE(DATA_TYPE, 4)
516 d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
517 VEC_DATA_TYPE(DATA_TYPE, 2)
518 d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100519
520 // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100521 DATA_TYPE k0 = d41.s0;
522 DATA_TYPE k1 = d41.s0;
523 DATA_TYPE k2 = d41.s0;
524 DATA_TYPE k3 = d41.s0;
525 DATA_TYPE k4 = d41.s0;
526 DATA_TYPE k5 = 0.0f;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100527
528 k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
529 k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
530 k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
531 k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
532 k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
533 k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
534
535 out0 += k0;
536 out1 += k1;
537 out2 += k2;
538 out3 += k3;
539 out4 += k4;
540 out5 += k5;
541
542 // Row2
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100543 VEC_DATA_TYPE(DATA_TYPE, 4)
544 d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
545 VEC_DATA_TYPE(DATA_TYPE, 2)
546 d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100547
548 out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
549 out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
550 out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
551 out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
552 out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
553 out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
554#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
555
556 // Compute destination address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000557#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100558 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000559#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000560 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000561#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100562
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100563 uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100564
565 *(dst_addr) = out0;
566 dst_addr += dst_plane_stride;
567 *(dst_addr) = out1;
568 dst_addr += dst_plane_stride;
569 *(dst_addr) = out2;
570 dst_addr += dst_plane_stride;
571 *(dst_addr) = out3;
572 dst_addr += dst_plane_stride;
573 *(dst_addr) = out4;
574 dst_addr += dst_plane_stride;
575 *(dst_addr) = out5;
576 dst_addr += dst_plane_stride;
577
578#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100579 DATA_TYPE out6 = k0;
580 DATA_TYPE out7 = k1;
581 DATA_TYPE out8 = k2;
582 DATA_TYPE out9 = k3;
583 DATA_TYPE out10 = k4;
584 DATA_TYPE out11 = k5;
585 DATA_TYPE out12 = k0;
586 DATA_TYPE out13 = k1;
587 DATA_TYPE out14 = k2;
588 DATA_TYPE out15 = k3;
589 DATA_TYPE out16 = k4;
590 DATA_TYPE out17 = k5;
591 DATA_TYPE out18 = k0;
592 DATA_TYPE out19 = k1;
593 DATA_TYPE out20 = k2;
594 DATA_TYPE out21 = k3;
595 DATA_TYPE out22 = k4;
596 DATA_TYPE out23 = k5;
597 DATA_TYPE out24 = k0;
598 DATA_TYPE out25 = k1;
599 DATA_TYPE out26 = k2;
600 DATA_TYPE out27 = k3;
601 DATA_TYPE out28 = k4;
602 DATA_TYPE out29 = k5;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100603
604 // Row1
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100605 VEC_DATA_TYPE(DATA_TYPE, 4)
606 d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
607 VEC_DATA_TYPE(DATA_TYPE, 2)
608 d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100609
610 // Row3
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100611 VEC_DATA_TYPE(DATA_TYPE, 4)
612 d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
613 VEC_DATA_TYPE(DATA_TYPE, 2)
614 d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100615
616 // Compute common parts for the channels between [6, 29]
617 // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
618 // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100619 DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
620 DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
621 DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
622 DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
623 DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
624 DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
625 DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
626 DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
627 DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
628 DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
629 DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
630 DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100631
632 // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
633 // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100634 DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
635 DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
636 DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
637 DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
638 DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
639 DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
640 DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
641 DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
642 DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
643 DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
644 DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
645 DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100646
647 out6 += part0 - part1;
648 out12 += part0 + part1;
649 out7 += part2 + part3 + part4 + part5;
650 out8 += part2 - part3 + part4 - part5;
651 out13 += part2 + part3 - part4 - part5;
652 out14 += part2 - part3 - part4 + part5;
653 out9 += part6 + part7 + part8 + part9;
654 out10 += part6 - part7 + part8 - part9;
655 out15 += part6 - part7 - part8 + part9;
656 out16 += part6 + part7 - part8 - part9;
657 out11 += part10 + part11;
658 out17 += part10 - part11;
659
660 out18 += part13 - part12;
661 out24 += part13 + part12;
662 out19 += part14 + part15 + part16 + part17;
663 out20 += part14 - part15 + part16 - part17;
664 out25 += part14 - part15 - part16 + part17;
665 out26 += part14 + part15 - part16 - part17;
666 out21 += part18 + part19 + part20 + part21;
667 out22 += part18 - part19 + part20 - part21;
668 out27 += part18 - part19 - part20 + part21;
669 out28 += part18 + part19 - part20 - part21;
670 out23 += part22 + part23;
671 out29 += part22 - part23;
672
673 *(dst_addr) = out6;
674 dst_addr += dst_plane_stride;
675 *(dst_addr) = out7;
676 dst_addr += dst_plane_stride;
677 *(dst_addr) = out8;
678 dst_addr += dst_plane_stride;
679 *(dst_addr) = out9;
680 dst_addr += dst_plane_stride;
681 *(dst_addr) = out10;
682 dst_addr += dst_plane_stride;
683 *(dst_addr) = out11;
684 dst_addr += dst_plane_stride;
685 *(dst_addr) = out12;
686 dst_addr += dst_plane_stride;
687 *(dst_addr) = out13;
688 dst_addr += dst_plane_stride;
689 *(dst_addr) = out14;
690 dst_addr += dst_plane_stride;
691 *(dst_addr) = out15;
692 dst_addr += dst_plane_stride;
693 *(dst_addr) = out16;
694 dst_addr += dst_plane_stride;
695 *(dst_addr) = out17;
696 dst_addr += dst_plane_stride;
697
698 *(dst_addr) = out18;
699 dst_addr += dst_plane_stride;
700 *(dst_addr) = out19;
701 dst_addr += dst_plane_stride;
702 *(dst_addr) = out20;
703 dst_addr += dst_plane_stride;
704 *(dst_addr) = out21;
705 dst_addr += dst_plane_stride;
706 *(dst_addr) = out22;
707 dst_addr += dst_plane_stride;
708 *(dst_addr) = out23;
709 dst_addr += dst_plane_stride;
710 *(dst_addr) = out24;
711 dst_addr += dst_plane_stride;
712 *(dst_addr) = out25;
713 dst_addr += dst_plane_stride;
714 *(dst_addr) = out26;
715 dst_addr += dst_plane_stride;
716 *(dst_addr) = out27;
717 dst_addr += dst_plane_stride;
718 *(dst_addr) = out28;
719 dst_addr += dst_plane_stride;
720 *(dst_addr) = out29;
721 dst_addr += dst_plane_stride;
722
723 // Row5
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100724 VEC_DATA_TYPE(DATA_TYPE, 4)
725 d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
726 VEC_DATA_TYPE(DATA_TYPE, 2)
727 d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100728
729 // Channels [30, 35]
730 out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
731 out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
732 out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
733 out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
734 out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
735 out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
736
737 *(dst_addr) = out0;
738 dst_addr += dst_plane_stride;
739 *(dst_addr) = out1;
740 dst_addr += dst_plane_stride;
741 *(dst_addr) = out2;
742 dst_addr += dst_plane_stride;
743 *(dst_addr) = out3;
744 dst_addr += dst_plane_stride;
745 *(dst_addr) = out4;
746 dst_addr += dst_plane_stride;
747 *(dst_addr) = out5;
748 dst_addr += dst_plane_stride;
749#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
750}
751
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100752/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
753 *
754 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
755 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
756 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
757 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
758 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
759 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
760 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
761 *
762 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
763 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
764 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
765 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
766 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
767 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
768 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
769 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
770 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
771 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
772 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
773 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
774 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
775 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
776 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
777 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
778 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
779 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
780 */
781__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
782 TENSOR3D_DECLARATION(src),
783 TENSOR3D_DECLARATION(dst),
784 uint src_stride_w,
785 uint dst_stride_w)
786{
787 const int x = get_global_id(0);
788 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000789#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100790 const int z = get_global_id(2) % SRC_DEPTH;
791 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000792#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000793 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000794#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100795
796 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000797#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100798 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000799#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000800 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000801#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100802 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
803
804 // Load input tile
805#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
806 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
807#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
808 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
809 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
810 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
811 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
812 *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
813 *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
814 *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
815 *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
816#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
817 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
818 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
819 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
820 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
821 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
822 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
823 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
824 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
825#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
826
827 // Calculate common factors for intermediate tensor
828 VEC_DATA_TYPE(DATA_TYPE, 8)
829 tmp0 = in_row0;
830 VEC_DATA_TYPE(DATA_TYPE, 8)
831 comm_fact0 = 0.0f;
832
833#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arena049989a2021-03-22 17:02:26 +0000834 comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
835 tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100836
837 VEC_DATA_TYPE(DATA_TYPE, 8)
Giorgio Arena049989a2021-03-22 17:02:26 +0000838 comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100839 VEC_DATA_TYPE(DATA_TYPE, 8)
Giorgio Arena049989a2021-03-22 17:02:26 +0000840 comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100841
842 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
843 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
844
Giorgio Arena049989a2021-03-22 17:02:26 +0000845 comm_fact0 = (DATA_TYPE)2.5f * in_row3;
846 comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100847
848 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
849 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
850
Giorgio Arena049989a2021-03-22 17:02:26 +0000851 comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
852 comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100853
854 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
855 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
Giorgio Arena049989a2021-03-22 17:02:26 +0000856 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100857#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
858
859 // Calculate output rows (reuse comm_fact0 vector)
860 VEC_DATA_TYPE(DATA_TYPE, 8)
861 out0;
862
863 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
864
865#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
866 VEC_DATA_TYPE(DATA_TYPE, 8)
867 out1, out2, out3, out4, out5, out6, out7;
868
869 OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
870 OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
871 OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
872 OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
873 OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
874 OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
875 OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
876#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
877
878 // Store values across the channels
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000879#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100880 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000881#else /* defined(SRC_DEPTH) */
882 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
883#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100884
885 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
886 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
887 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
888 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
889 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
890 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
891 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
892 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
893
894#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
895 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
896 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
897 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
898 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
899 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
900 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
901 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
902 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
903 *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
904 *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
905 *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
906 *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
907 *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
908 *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
909 *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
910 *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
911 *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
912 *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
913 *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
914 *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
915 *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
916 *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
917 *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
918 *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
919 *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
920 *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
921 *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
922 *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
923 *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
924 *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
925 *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
926 *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
927 *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
928 *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
929 *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
930 *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
931 *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
932 *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
933 *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
934 *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
935 *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
936 *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
937 *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
938 *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
939 *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
940 *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
941 *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
942 *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
943 *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
944 *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
945 *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
946 *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
947 *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
948 *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
949 *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
950 *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
951#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
952}
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100953
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000954#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100955/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100956 *
957 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
958 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
959 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
960 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100961 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
962 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
963 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
964 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100965 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100966 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100967 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100968 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
969 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
970 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
971 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
972 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
973 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
974 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
975 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
976 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
977 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
978 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
979 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
980 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
981 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
982 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100983 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
984 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100985 */
986__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
987 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100988 TENSOR3D_DECLARATION(dst),
989 uint src_stride_w,
990 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100991{
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100992 // Index channel
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100993 const int x = get_global_id(0);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100994 // Index width
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100995 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000996#if defined(NUM_TILES_Y)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100997 // Index height
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100998 const int z = get_global_id(2) % NUM_TILES_Y;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100999 // Index batch size
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001000 const int b = get_global_id(2) / NUM_TILES_Y;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001001#else // defined(NUM_TILES_Y)
1002 // Index height
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001003 const int z = get_global_id(2);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001004#endif // defined(NUM_TILES_Y)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001005
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001006#if defined(NUM_TILES_Y)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001007 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001008#else // defined(NUM_TILES_Y)
1009 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
1010#endif // defined(NUM_TILES_Y)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001011
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001012 // Origin coordinates for the width (y) and height (z) in the input tensor
Giorgio Arena149fdf32018-07-04 17:03:33 +01001013 int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
1014 int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001015 int4 z_coord0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
1016 int2 z_coord1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001017
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001018 // Coordinates to use to avoid out-of-bound reads
1019 int4 y_coord_valid0 = clamp(y_coord0, (int4)0, (int4)((int)SRC_DIM_1 - 1));
1020 int2 y_coord_valid1 = clamp(y_coord1, (int2)0, (int2)((int)SRC_DIM_1 - 1));
1021 int4 z_coord_valid0 = clamp(z_coord0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
1022 int2 z_coord_valid1 = clamp(z_coord1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
1023
1024 // Boundary conditions
1025 int4 y_cond0 = y_coord_valid0 == y_coord0;
1026 int2 y_cond1 = y_coord_valid1 == y_coord1;
1027 int4 z_cond0 = z_coord_valid0 == z_coord0;
1028 int2 z_cond1 = z_coord_valid1 == z_coord1;
Giorgio Arena149fdf32018-07-04 17:03:33 +01001029
Giorgio Arena149fdf32018-07-04 17:03:33 +01001030#if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001031 DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1032 DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1033 DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1034 DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1035 DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1036 DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001037
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001038 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d0, y_cond, z_cond0.s0);
Giorgio Arena149fdf32018-07-04 17:03:33 +01001039#else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001040 DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1041 DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1042 DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1043 DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1044 DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1045 DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001046
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001047 FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(DATA_TYPE, d0, y_cond0.s0, z_cond);
Giorgio Arena149fdf32018-07-04 17:03:33 +01001048#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1049
Giorgio Arena149fdf32018-07-04 17:03:33 +01001050#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arena049989a2021-03-22 17:02:26 +00001051 DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1052 DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1053 DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1054 DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1055 DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1056 DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1057
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001058 DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1059 DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1060 DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1061 DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1062 DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1063 DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001064
Giorgio Arena049989a2021-03-22 17:02:26 +00001065 DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1066 DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1067 DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1068 DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1069 DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1070 DATA_TYPE d35 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1071
1072 DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1073 DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1074 DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1075 DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1076 DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1077 DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1078
1079 DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1080 DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1081 DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1082 DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1083 DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1084 DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1085
1086 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d1, y_cond, z_cond0.s1);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001087 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d2, y_cond, z_cond0.s2);
Giorgio Arena049989a2021-03-22 17:02:26 +00001088 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d3, y_cond, z_cond0.s3);
1089 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d4, y_cond, z_cond1.s0);
1090 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d5, y_cond, z_cond1.s1);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001091
Giorgio Arena049989a2021-03-22 17:02:26 +00001092 DATA_TYPE k0, k1, k2, k3, k4, k5;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001093
Giorgio Arena049989a2021-03-22 17:02:26 +00001094 DATA_TYPE part00, part01, part02, part03, part04, part05;
1095 DATA_TYPE part10, part11, part12, part13, part14, part15;
1096 DATA_TYPE part20, part21, part22, part23, part24, part25;
1097 DATA_TYPE part30, part31, part32, part33, part34, part35;
1098 DATA_TYPE part40, part41, part42, part43, part44, part45;
1099 DATA_TYPE part50, part51, part52, part53, part54, part55;
1100
1101#define COMMON_OPS_0(i) \
1102 k0 = d2##i - 4.f * d0##i; \
1103 k1 = d3##i - 4.f * d1##i; \
1104 k2 = d4##i - 4.f * d2##i; \
1105 k3 = d5##i - 4.f * d3##i; \
1106 k4 = d3##i - d1##i; \
1107 k4 = k4 + k4; \
1108 k5 = d4##i - d2##i; \
1109 part0##i = k2 - k0; \
1110 part1##i = k2 + k1; \
1111 part2##i = k2 - k1; \
1112 part3##i = k5 + k4; \
1113 part4##i = k5 - k4; \
1114 part5##i = k3 - k1;
1115
1116#define COMMON_OPS_1(i) \
1117 k0 = part##i##2 - 4.f * part##i##0; \
1118 k1 = part##i##3 - 4.f * part##i##1; \
1119 k2 = part##i##4 - 4.f * part##i##2; \
1120 k3 = part##i##5 - 4.f * part##i##3; \
1121 k4 = part##i##3 - part##i##1; \
1122 k4 = k4 + k4; \
1123 k5 = part##i##4 - part##i##2; \
1124 DATA_TYPE out##i##0 = k2 - k0; \
1125 DATA_TYPE out##i##1 = k2 + k1; \
1126 DATA_TYPE out##i##2 = k2 - k1; \
1127 DATA_TYPE out##i##3 = k5 + k4; \
1128 DATA_TYPE out##i##4 = k5 - k4; \
1129 DATA_TYPE out##i##5 = k3 - k1;
1130
1131 COMMON_OPS_0(0);
1132 COMMON_OPS_0(1);
1133 COMMON_OPS_0(2);
1134 COMMON_OPS_0(3);
1135 COMMON_OPS_0(4);
1136 COMMON_OPS_0(5);
1137
1138 COMMON_OPS_1(0);
1139 COMMON_OPS_1(1);
1140 COMMON_OPS_1(2);
1141 COMMON_OPS_1(3);
1142 COMMON_OPS_1(4);
1143 COMMON_OPS_1(5);
1144
1145#undef COMMON_OPS_0
1146#undef COMMON_OPS_1
1147
1148#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1149
1150 DATA_TYPE k0, k1, k2, k3, k4, k5;
1151 DATA_TYPE part0, part1, part2, part3, part4, part5;
1152
1153 part0 = 4.f * d00;
1154 part1 = 4.f * d01;
1155 part2 = 4.f * d02;
1156 part3 = 4.f * d03;
1157 part4 = 4.f * d04;
1158 part5 = 4.f * d05;
1159
1160 k0 = part2 - 4.f * part0;
1161 k1 = part3 - 4.f * part1;
1162 k2 = part4 - 4.f * part2;
1163 k3 = part5 - 4.f * part3;
1164 k4 = part3 - part1;
1165 k4 = k4 + k4;
1166 k5 = part4 - part2;
1167
1168 DATA_TYPE out00 = k2 - k0;
1169 DATA_TYPE out01 = k2 + k1;
1170 DATA_TYPE out02 = k2 - k1;
1171 DATA_TYPE out03 = k5 + k4;
1172 DATA_TYPE out04 = k5 - k4;
1173 DATA_TYPE out05 = k3 - k1;
1174
Giorgio Arena149fdf32018-07-04 17:03:33 +01001175#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1176
1177 // Compute destination address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001178#if defined(NUM_TILES_Y)
1179 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001180#else // defined(NUM_TILES_Y)
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001181 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001182#endif // defined(NUM_TILES_Y)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001183
1184 uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001185
Giorgio Arena049989a2021-03-22 17:02:26 +00001186 *((__global DATA_TYPE *)dst_addr) = out00;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001187 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001188 *((__global DATA_TYPE *)dst_addr) = out01;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001189 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001190 *((__global DATA_TYPE *)dst_addr) = out02;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001191 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001192 *((__global DATA_TYPE *)dst_addr) = out03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001193 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001194 *((__global DATA_TYPE *)dst_addr) = out04;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001195 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001196 *((__global DATA_TYPE *)dst_addr) = out05;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001197 dst_addr += dst_plane_stride;
1198
Giorgio Arena149fdf32018-07-04 17:03:33 +01001199#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001200 *((__global DATA_TYPE *)dst_addr) = out10;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001201 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001202 *((__global DATA_TYPE *)dst_addr) = out11;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001203 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001204 *((__global DATA_TYPE *)dst_addr) = out12;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001205 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001206 *((__global DATA_TYPE *)dst_addr) = out13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001207 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001208 *((__global DATA_TYPE *)dst_addr) = out14;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001209 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001210 *((__global DATA_TYPE *)dst_addr) = out15;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001211 dst_addr += dst_plane_stride;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001212
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001213 *((__global DATA_TYPE *)dst_addr) = out20;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001214 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001215 *((__global DATA_TYPE *)dst_addr) = out21;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001216 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001217 *((__global DATA_TYPE *)dst_addr) = out22;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001218 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001219 *((__global DATA_TYPE *)dst_addr) = out23;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001220 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001221 *((__global DATA_TYPE *)dst_addr) = out24;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001222 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001223 *((__global DATA_TYPE *)dst_addr) = out25;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001224 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001225
1226 *((__global DATA_TYPE *)dst_addr) = out30;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001227 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001228 *((__global DATA_TYPE *)dst_addr) = out31;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001229 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001230 *((__global DATA_TYPE *)dst_addr) = out32;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001231 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001232 *((__global DATA_TYPE *)dst_addr) = out33;
1233 dst_addr += dst_plane_stride;
1234 *((__global DATA_TYPE *)dst_addr) = out34;
1235 dst_addr += dst_plane_stride;
1236 *((__global DATA_TYPE *)dst_addr) = out35;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001237 dst_addr += dst_plane_stride;
1238
Giorgio Arena049989a2021-03-22 17:02:26 +00001239 *((__global DATA_TYPE *)dst_addr) = out40;
1240 dst_addr += dst_plane_stride;
1241 *((__global DATA_TYPE *)dst_addr) = out41;
1242 dst_addr += dst_plane_stride;
1243 *((__global DATA_TYPE *)dst_addr) = out42;
1244 dst_addr += dst_plane_stride;
1245 *((__global DATA_TYPE *)dst_addr) = out43;
1246 dst_addr += dst_plane_stride;
1247 *((__global DATA_TYPE *)dst_addr) = out44;
1248 dst_addr += dst_plane_stride;
1249 *((__global DATA_TYPE *)dst_addr) = out45;
1250 dst_addr += dst_plane_stride;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001251
Giorgio Arena049989a2021-03-22 17:02:26 +00001252 *((__global DATA_TYPE *)dst_addr) = out50;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001253 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001254 *((__global DATA_TYPE *)dst_addr) = out51;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001255 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001256 *((__global DATA_TYPE *)dst_addr) = out52;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001257 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001258 *((__global DATA_TYPE *)dst_addr) = out53;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001259 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001260 *((__global DATA_TYPE *)dst_addr) = out54;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001261 dst_addr += dst_plane_stride;
Giorgio Arena049989a2021-03-22 17:02:26 +00001262 *((__global DATA_TYPE *)dst_addr) = out55;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001263 dst_addr += dst_plane_stride;
Giorgio Arena149fdf32018-07-04 17:03:33 +01001264#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001265}
1266
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001267/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
Giorgio Arena149fdf32018-07-04 17:03:33 +01001268 *
1269 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1270 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001271 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
1272 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001273 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
Giorgio Arena149fdf32018-07-04 17:03:33 +01001274 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001275 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1276 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001277 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arena149fdf32018-07-04 17:03:33 +01001278 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001279 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arena149fdf32018-07-04 17:03:33 +01001280 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1281 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1282 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1283 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1284 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1285 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1286 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1287 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1288 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1289 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1290 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1291 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1292 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1293 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1294 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001295 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1296 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001297 */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001298__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
Giorgio Arena149fdf32018-07-04 17:03:33 +01001299 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001300 TENSOR3D_DECLARATION(dst),
1301 uint src_stride_w,
1302 uint dst_stride_w)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001303{
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001304 const int x = get_global_id(0);
1305 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001306#if defined(NUM_TILES_Y)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001307 const int z = get_global_id(2) % NUM_TILES_Y;
1308 const int b = get_global_id(2) / NUM_TILES_Y;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001309#else // defined(NUM_TILES_Y)
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001310 const int z = get_global_id(2);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001311#endif // defined(NUM_TILES_Y)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001312
1313 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001314#if defined(NUM_TILES_Y)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001315 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001316#else // defined(NUM_TILES_Y)
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001317 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001318#endif // defined(NUM_TILES_Y)
1319
1320 // Origin coordinates for the width (y) and height (z) in the input tensor
1321 int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
1322 int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
1323
1324 // Coordinates to use to avoid out-of-bound reads
1325 int8 y_coord_valid0 = clamp(y_coord0, (int8)0, (int8)((int)SRC_DIM_1 - 1));
1326 int8 z_coord_valid0 = clamp(z_coord0, (int8)0, (int8)((int)SRC_DIM_2 - 1));
1327
1328 // Boundary conditions
1329 int8 y_cond0 = y_coord_valid0 == y_coord0;
1330 int8 z_cond0 = z_coord_valid0 == z_coord0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001331
1332#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001333
1334 // Load the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001335 VEC_DATA_TYPE(DATA_TYPE, 8)
1336 in_row0;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001337 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1338 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1339 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1340 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1341 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1342 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1343 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1344 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1345
1346 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001347
1348 // Calculate common factors for intermediate tensor
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001349 VEC_DATA_TYPE(DATA_TYPE, 8)
1350 comm_fact0 = 0.0f;
1351 VEC_DATA_TYPE(DATA_TYPE, 8)
1352 tmp0 = in_row0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001353
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001354 VEC_DATA_TYPE(DATA_TYPE, 8)
1355 out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001356
1357 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
1358
1359#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001360
1361 // Load the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001362 VEC_DATA_TYPE(DATA_TYPE, 8)
1363 in_row0;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001364 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1365 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1366 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1367 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1368 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1369 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1370 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1371 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1372
1373 FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(DATA_TYPE, in_row0.s, y_cond0.s0, z_cond);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001374
1375 // Calculate common factors for intermediate tensor
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001376 VEC_DATA_TYPE(DATA_TYPE, 8)
1377 comm_fact0 = 0.0f;
1378 VEC_DATA_TYPE(DATA_TYPE, 8)
1379 tmp0 = in_row0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001380
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001381 VEC_DATA_TYPE(DATA_TYPE, 8)
1382 out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001383
1384 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
1385#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001386 VEC_DATA_TYPE(DATA_TYPE, 8)
1387 in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001388
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001389 // Row0
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001390 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1391 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1392 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1393 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1394 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1395 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1396 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1397 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001398
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001399 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001400
1401 // Row1
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001402 in_row1.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1403 in_row1.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1404 in_row1.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1405 in_row1.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1406 in_row1.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1407 in_row1.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1408 in_row1.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1409 in_row1.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001410
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001411 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row1.s, y_cond, z_cond0.s1);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001412
1413 // Row2
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001414 in_row2.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1415 in_row2.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1416 in_row2.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1417 in_row2.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1418 in_row2.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1419 in_row2.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1420 in_row2.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1421 in_row2.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001422
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001423 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row2.s, y_cond, z_cond0.s2);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001424
1425 // Row3
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001426 in_row3.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1427 in_row3.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1428 in_row3.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1429 in_row3.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1430 in_row3.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1431 in_row3.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1432 in_row3.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1433 in_row3.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001434
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001435 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row3.s, y_cond, z_cond0.s3);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001436
1437 // Row4
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001438 in_row4.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1439 in_row4.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1440 in_row4.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1441 in_row4.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1442 in_row4.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1443 in_row4.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1444 in_row4.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1445 in_row4.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001446
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001447 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row4.s, y_cond, z_cond0.s4);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001448
1449 // Row5
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001450 in_row5.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1451 in_row5.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1452 in_row5.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1453 in_row5.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1454 in_row5.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1455 in_row5.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1456 in_row5.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1457 in_row5.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001458
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001459 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row5.s, y_cond, z_cond0.s5);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001460
1461 // Row6
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001462 in_row6.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1463 in_row6.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1464 in_row6.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1465 in_row6.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1466 in_row6.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1467 in_row6.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1468 in_row6.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1469 in_row6.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001470
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001471 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row6.s, y_cond, z_cond0.s6);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001472
1473 // Row7
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001474 in_row7.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1475 in_row7.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1476 in_row7.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1477 in_row7.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1478 in_row7.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1479 in_row7.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1480 in_row7.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1481 in_row7.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001482
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001483 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row7.s, y_cond, z_cond0.s7);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001484
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001485 VEC_DATA_TYPE(DATA_TYPE, 8)
1486 comm_fact0 = in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
1487 VEC_DATA_TYPE(DATA_TYPE, 8)
1488 comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
1489 VEC_DATA_TYPE(DATA_TYPE, 8)
1490 comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001491
1492 // Calculate intermediate tensor and reuse common factor vectors
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001493 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = in_row0 - in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
1494 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
1495 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001496
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001497 comm_fact0 = (DATA_TYPE)2.5f * in_row3;
1498 comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.f * in_row5;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001499
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001500 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
1501 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001502
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001503 comm_fact1 = (DATA_TYPE)2.f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
1504 comm_fact2 = (DATA_TYPE)4.f * in_row2 - (DATA_TYPE)5.f * in_row4 + in_row6;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001505
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001506 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
1507 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
1508 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001509
1510 // Calculate output rows (reuse comm_fact0 vector)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001511 VEC_DATA_TYPE(DATA_TYPE, 8)
1512 out0, out1, out2, out3, out4, out5, out6, out7;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001513 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
1514 OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
1515 OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
1516 OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
1517 OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
1518 OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
1519 OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
1520 OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001521#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1522
1523 // Store values across the channels
1524#if defined(NUM_TILES_Y)
1525 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
1526#else /* NUM_TILES_Y */
1527 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
1528#endif /* NUM_TILES_Y */
1529
1530 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
1531 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
1532 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
1533 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
1534 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
1535 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
1536 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
1537 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
1538
1539#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1540 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
1541 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
1542 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
1543 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
1544 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
1545 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
1546 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
1547 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
1548 *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
1549 *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
1550 *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
1551 *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
1552 *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
1553 *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
1554 *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
1555 *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
1556 *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
1557 *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
1558 *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
1559 *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
1560 *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
1561 *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
1562 *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
1563 *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
1564 *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
1565 *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
1566 *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
1567 *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
1568 *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
1569 *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
1570 *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
1571 *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
1572 *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
1573 *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
1574 *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
1575 *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
1576 *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
1577 *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
1578 *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
1579 *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
1580 *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
1581 *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
1582 *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
1583 *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
1584 *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
1585 *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
1586 *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
1587 *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
1588 *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
1589 *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
1590 *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
1591 *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
1592 *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
1593 *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
1594 *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
1595 *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
1596#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1597}
1598
1599/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
1600 *
1601 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
1602 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1603 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
1604 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1605 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1606 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1607 * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1608 * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
1609 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1610 *
1611 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
1612 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1613 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1614 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1615 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1616 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1617 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1618 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1619 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1620 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1621 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1622 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1623 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1624 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1625 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1626 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1627 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1628 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1629 */
1630__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
1631 TENSOR3D_DECLARATION(src),
1632 TENSOR3D_DECLARATION(dst),
1633 uint src_stride_w,
1634 uint dst_stride_w)
1635{
1636 const int x = get_global_id(0);
1637 const int y = get_global_id(1);
1638#if defined(NUM_TILES_Y)
1639 const int z = get_global_id(2) % NUM_TILES_Y;
1640 const int b = get_global_id(2) / NUM_TILES_Y;
1641#else /* defined(NUM_TILES_Y) */
1642 const int z = get_global_id(2);
1643#endif /* defined(NUM_TILES_Y) */
1644
1645 // Compute input address
1646#if defined(NUM_TILES_Y)
1647 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
1648#else /* defined(NUM_TILES_Y) */
1649 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
1650#endif /* defined(NUM_TILES_Y) */
1651
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001652 // Origin coordinates for the width (y) and height (z) in the input tensor
1653 int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
1654 int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
1655
1656 // Coordinates to use to avoid out-of-bound reads
1657 int8 y_coord_valid0 = clamp(y_coord0, (int8)0, (int8)((int)SRC_DIM_1 - 1));
1658 int8 z_coord_valid0 = clamp(z_coord0, (int8)0, (int8)((int)SRC_DIM_2 - 1));
1659
1660 // Boundary conditions
1661 int8 y_cond0 = y_coord_valid0 == y_coord0;
1662 int8 z_cond0 = z_coord_valid0 == z_coord0;
1663
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001664#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1665
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001666 // Load the input tile
1667 VEC_DATA_TYPE(DATA_TYPE, 8)
1668 in_row0;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001669 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1670 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1671 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1672 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1673 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1674 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1675 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1676 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1677
1678 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001679
1680 VEC_DATA_TYPE(DATA_TYPE, 8)
1681 out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
1682
1683 VEC_DATA_TYPE(DATA_TYPE, 8)
1684 tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
1685
1686 VEC_DATA_TYPE(DATA_TYPE, 8)
1687 comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
1688
1689 OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
1690
1691#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001692 // Load the input tile
1693 VEC_DATA_TYPE(DATA_TYPE, 8)
1694 in_row0;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001695 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1696 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1697 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1698 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1699 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1700 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1701 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1702 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1703
1704 FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(DATA_TYPE, in_row0.s, y_cond0.s0, z_cond);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001705
1706 // Calculate common factors for intermediate tensor
1707 VEC_DATA_TYPE(DATA_TYPE, 8)
1708 tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
1709
1710 VEC_DATA_TYPE(DATA_TYPE, 8)
1711 out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
1712
1713 VEC_DATA_TYPE(DATA_TYPE, 8)
1714 comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
1715
1716 OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
1717#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1718 VEC_DATA_TYPE(DATA_TYPE, 8)
1719 in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
1720
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001721 // Row0
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001722 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1723 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1724 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1725 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1726 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1727 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1728 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1729 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001730
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001731 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001732
1733 // Row1
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001734 in_row1.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1735 in_row1.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1736 in_row1.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1737 in_row1.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1738 in_row1.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1739 in_row1.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1740 in_row1.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1741 in_row1.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001742
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001743 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row1.s, y_cond, z_cond0.s1);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001744
1745 // Row2
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001746 in_row2.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1747 in_row2.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1748 in_row2.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1749 in_row2.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1750 in_row2.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1751 in_row2.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1752 in_row2.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1753 in_row2.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001754
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001755 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row2.s, y_cond, z_cond0.s2);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001756
1757 // Row3
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001758 in_row3.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1759 in_row3.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1760 in_row3.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1761 in_row3.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1762 in_row3.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1763 in_row3.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1764 in_row3.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1765 in_row3.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001766
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001767 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row3.s, y_cond, z_cond0.s3);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001768
1769 // Row4
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001770 in_row4.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1771 in_row4.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1772 in_row4.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1773 in_row4.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1774 in_row4.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1775 in_row4.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1776 in_row4.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1777 in_row4.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001778
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001779 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row4.s, y_cond, z_cond0.s4);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001780
1781 // Row5
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001782 in_row5.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1783 in_row5.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1784 in_row5.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1785 in_row5.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1786 in_row5.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1787 in_row5.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1788 in_row5.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1789 in_row5.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001790
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001791 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row5.s, y_cond, z_cond0.s5);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001792
1793 // Row6
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001794 in_row6.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1795 in_row6.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1796 in_row6.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1797 in_row6.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1798 in_row6.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1799 in_row6.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1800 in_row6.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1801 in_row6.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001802
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001803 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row6.s, y_cond, z_cond0.s6);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001804
1805 // Row7
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001806 in_row7.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1807 in_row7.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1808 in_row7.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1809 in_row7.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1810 in_row7.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1811 in_row7.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1812 in_row7.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1813 in_row7.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001814
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001815 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row7.s, y_cond, z_cond0.s7);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001816
1817 VEC_DATA_TYPE(DATA_TYPE, 8)
1818 comm_fact0 = (DATA_TYPE)36.0f * in_row2 - (DATA_TYPE)13.0f * in_row4 + in_row6;
1819 VEC_DATA_TYPE(DATA_TYPE, 8)
1820 comm_fact1 = (DATA_TYPE)36.0f * in_row1 - (DATA_TYPE)13.0f * in_row3 + in_row5;
1821 VEC_DATA_TYPE(DATA_TYPE, 8)
1822 comm_fact2 = (DATA_TYPE)9.0f * in_row2 - (DATA_TYPE)10.0f * in_row4 + in_row6;
1823 VEC_DATA_TYPE(DATA_TYPE, 8)
1824 comm_fact3 = (DATA_TYPE)18.0f * in_row1 - (DATA_TYPE)20.0f * in_row3 + (DATA_TYPE)2.0f * in_row5;
1825 VEC_DATA_TYPE(DATA_TYPE, 8)
1826 comm_fact4 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
1827 VEC_DATA_TYPE(DATA_TYPE, 8)
1828 comm_fact5 = (DATA_TYPE)12.0f * in_row1 - (DATA_TYPE)15.0f * in_row3 + (DATA_TYPE)3.0f * in_row5;
1829
1830 // Calculate intermediate tensors
1831 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = -(DATA_TYPE)36.0f * in_row0 + (DATA_TYPE)49.0f * in_row2 - (DATA_TYPE)14.0f * in_row4 + in_row6;
1832 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 - comm_fact1;
1833 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 + comm_fact1;
1834 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact2 - comm_fact3;
1835 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 + comm_fact3;
1836 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact4 - comm_fact5;
1837 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact4 + comm_fact5;
1838 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = -(DATA_TYPE)36.0f * in_row1 + (DATA_TYPE)49.0f * in_row3 - (DATA_TYPE)14.0f * in_row5 + in_row7;
1839
1840 VEC_DATA_TYPE(DATA_TYPE, 8)
1841 out0, out1, out2, out3, out4, out5, out6, out7;
1842
1843 OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
1844 OUTPUT_ROW_2x2_7x7(out1, tmp1, comm_fact0);
1845 OUTPUT_ROW_2x2_7x7(out2, tmp2, comm_fact0);
1846 OUTPUT_ROW_2x2_7x7(out3, tmp3, comm_fact0);
1847 OUTPUT_ROW_2x2_7x7(out4, tmp4, comm_fact0);
1848 OUTPUT_ROW_2x2_7x7(out5, tmp5, comm_fact0);
1849 OUTPUT_ROW_2x2_7x7(out6, tmp6, comm_fact0);
1850 OUTPUT_ROW_2x2_7x7(out7, tmp7, comm_fact0);
1851
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001852#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001853
1854 // Store values across the channels
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001855#if defined(NUM_TILES_Y)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001856 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001857#else /* NUM_TILES_Y */
1858 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
1859#endif /* NUM_TILES_Y */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001860
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001861 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
1862 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
1863 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
1864 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
1865 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
1866 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
1867 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
1868 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001869
1870#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001871 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
1872 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
1873 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
1874 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
1875 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
1876 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
1877 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
1878 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
1879 *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
1880 *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
1881 *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
1882 *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
1883 *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
1884 *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
1885 *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
1886 *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
1887 *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
1888 *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
1889 *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
1890 *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
1891 *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
1892 *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
1893 *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
1894 *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
1895 *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
1896 *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
1897 *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
1898 *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
1899 *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
1900 *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
1901 *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
1902 *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
1903 *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
1904 *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
1905 *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
1906 *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
1907 *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
1908 *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
1909 *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
1910 *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
1911 *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
1912 *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
1913 *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
1914 *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
1915 *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
1916 *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
1917 *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
1918 *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
1919 *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
1920 *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
1921 *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
1922 *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
1923 *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
1924 *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
1925 *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
1926 *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001927#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001928}
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001929#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001930
1931#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1932/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
1933 *
1934 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1935 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1936 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1937 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1938 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001939 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001940 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001941 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001942 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1943 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1944 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1945 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1946 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1947 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1948 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1949 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1950 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1951 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1952 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1953 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1954 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1955 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1956 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001957 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1958 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001959 */
1960__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
1961 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001962 TENSOR3D_DECLARATION(dst),
1963 uint src_stride_w,
1964 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001965{
1966 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
1967 src_stride_x,
1968 src_step_x,
1969 src_stride_y,
1970 src_step_y,
1971 src_stride_z,
1972 src_step_z,
1973 src_offset_first_element_in_bytes,
1974 dst_ptr,
1975 dst_stride_x,
1976 dst_step_x,
1977 dst_stride_y,
1978 dst_step_y,
1979 dst_stride_z,
1980 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001981 dst_offset_first_element_in_bytes,
1982 src_stride_w,
1983 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001984}
1985
1986/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2
1987 *
1988 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1989 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1990 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1991 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1992 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001993 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001994 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001995 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001996 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1997 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1998 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1999 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2000 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2001 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2002 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2003 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2004 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2005 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2006 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2007 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2008 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2009 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2010 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002011 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2012 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002013 */
2014__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
2015 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002016 TENSOR3D_DECLARATION(dst),
2017 uint src_stride_w,
2018 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002019{
2020 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
2021 src_stride_x,
2022 src_step_x,
2023 src_stride_y,
2024 src_step_y,
2025 src_stride_z,
2026 src_step_z,
2027 src_offset_first_element_in_bytes,
2028 dst_ptr,
2029 dst_stride_x,
2030 dst_step_x,
2031 dst_stride_y,
2032 dst_step_y,
2033 dst_stride_z,
2034 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002035 dst_offset_first_element_in_bytes,
2036 src_stride_w,
2037 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002038}
2039
2040/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
2041 *
2042 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2043 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2044 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
2045 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2046 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002047 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002048 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002049 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002050 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2051 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2052 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2053 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2054 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2055 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2056 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2057 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2058 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2059 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2060 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2061 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2062 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2063 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2064 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002065 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2066 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002067 */
2068__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
2069 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002070 TENSOR3D_DECLARATION(dst),
2071 uint src_stride_w,
2072 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002073{
2074 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
2075 src_stride_x,
2076 src_step_x,
2077 src_stride_y,
2078 src_step_y,
2079 src_stride_z,
2080 src_step_z,
2081 src_offset_first_element_in_bytes,
2082 dst_ptr,
2083 dst_stride_x,
2084 dst_step_x,
2085 dst_stride_y,
2086 dst_step_y,
2087 dst_stride_z,
2088 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002089 dst_offset_first_element_in_bytes,
2090 src_stride_w,
2091 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002092}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002093
2094/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
2095 *
2096 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2097 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2098 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
2099 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
2100 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002101 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002102 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002103 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002104 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2105 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2106 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2107 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2108 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2109 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2110 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2111 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2112 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2113 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2114 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2115 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2116 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2117 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2118 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002119 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2120 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002121 */
2122__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
2123 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002124 TENSOR3D_DECLARATION(dst),
2125 uint src_stride_w,
2126 uint dst_stride_w)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002127{
2128 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
2129 src_stride_x,
2130 src_step_x,
2131 src_stride_y,
2132 src_step_y,
2133 src_stride_z,
2134 src_step_z,
2135 src_offset_first_element_in_bytes,
2136 dst_ptr,
2137 dst_stride_x,
2138 dst_step_x,
2139 dst_stride_y,
2140 dst_step_y,
2141 dst_stride_z,
2142 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002143 dst_offset_first_element_in_bytes,
2144 src_stride_w,
2145 dst_stride_w);
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002146}
2147
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002148#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002149/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
2150 *
2151 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2152 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2153 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2154 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2155 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
2156 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2157 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002158 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002159 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002160 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002161 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2162 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2163 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2164 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2165 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2166 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2167 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2168 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2169 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2170 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2171 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2172 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2173 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2174 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2175 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002176 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2177 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002178 */
2179__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
2180 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002181 TENSOR3D_DECLARATION(dst),
2182 uint src_stride_w,
2183 uint dst_stride_w)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002184{
2185 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
2186 src_stride_x,
2187 src_step_x,
2188 src_stride_y,
2189 src_step_y,
2190 src_stride_z,
2191 src_step_z,
2192 src_offset_first_element_in_bytes,
2193 dst_ptr,
2194 dst_stride_x,
2195 dst_step_x,
2196 dst_stride_y,
2197 dst_step_y,
2198 dst_stride_z,
2199 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002200 dst_offset_first_element_in_bytes,
2201 src_stride_w,
2202 dst_stride_w);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002203}
2204
2205/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
2206 *
2207 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2208 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2209 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2210 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2211 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
2212 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2213 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002214 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002215 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002216 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002217 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2218 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2219 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2220 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2221 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2222 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2223 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2224 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2225 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2226 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2227 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2228 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2229 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2230 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2231 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002232 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2233 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002234 */
2235__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
2236 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002237 TENSOR3D_DECLARATION(dst),
2238 uint src_stride_w,
2239 uint dst_stride_w)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002240{
2241 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
2242 src_stride_x,
2243 src_step_x,
2244 src_stride_y,
2245 src_step_y,
2246 src_stride_z,
2247 src_step_z,
2248 src_offset_first_element_in_bytes,
2249 dst_ptr,
2250 dst_stride_x,
2251 dst_step_x,
2252 dst_stride_y,
2253 dst_step_y,
2254 dst_stride_z,
2255 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002256 dst_offset_first_element_in_bytes,
2257 src_stride_w,
2258 dst_stride_w);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002259}
Michele Di Giorgiof955d512019-02-27 14:26:51 +00002260
2261/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
2262 *
2263 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
2264 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2265 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2266 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2267 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=7
2268 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2269 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
2270 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
2271 *
2272 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
2273 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2274 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2275 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2276 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2277 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2278 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2279 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2280 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2281 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2282 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2283 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2284 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2285 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2286 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2287 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2288 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2289 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
2290 */
2291__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
2292 TENSOR3D_DECLARATION(src),
2293 TENSOR3D_DECLARATION(dst),
2294 uint src_stride_w,
2295 uint dst_stride_w)
2296{
2297 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
2298 src_stride_x,
2299 src_step_x,
2300 src_stride_y,
2301 src_step_y,
2302 src_stride_z,
2303 src_step_z,
2304 src_offset_first_element_in_bytes,
2305 dst_ptr,
2306 dst_stride_x,
2307 dst_step_x,
2308 dst_stride_y,
2309 dst_step_y,
2310 dst_stride_z,
2311 dst_step_z,
2312 dst_offset_first_element_in_bytes,
2313 src_stride_w,
2314 dst_stride_w);
2315}
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002316#endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002317#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
2318
2319#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
2320/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
2321 *
2322 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2323 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2324 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2325 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
2326 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002327 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002328 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002329 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002330 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2331 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2332 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2333 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2334 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2335 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2336 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2337 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2338 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2339 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2340 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2341 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2342 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2343 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2344 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002345 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2346 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002347 */
2348__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
2349 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002350 TENSOR3D_DECLARATION(dst),
2351 uint src_stride_w,
2352 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002353{
2354 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
2355 src_stride_x,
2356 src_step_x,
2357 src_stride_y,
2358 src_step_y,
2359 src_stride_z,
2360 src_step_z,
2361 src_offset_first_element_in_bytes,
2362 dst_ptr,
2363 dst_stride_x,
2364 dst_step_x,
2365 dst_stride_y,
2366 dst_step_y,
2367 dst_stride_z,
2368 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002369 dst_offset_first_element_in_bytes,
2370 src_stride_w,
2371 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002372}
2373
2374/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2
2375 *
2376 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2377 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2378 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2379 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
2380 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002381 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002382 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002383 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002384 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2385 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2386 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2387 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2388 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2389 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2390 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2391 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2392 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2393 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2394 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2395 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2396 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2397 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2398 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002399 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2400 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002401 */
2402__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
2403 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002404 TENSOR3D_DECLARATION(dst),
2405 uint src_stride_w,
2406 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002407{
2408 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
2409 src_stride_x,
2410 src_step_x,
2411 src_stride_y,
2412 src_step_y,
2413 src_stride_z,
2414 src_step_z,
2415 src_offset_first_element_in_bytes,
2416 dst_ptr,
2417 dst_stride_x,
2418 dst_step_x,
2419 dst_stride_y,
2420 dst_step_y,
2421 dst_stride_z,
2422 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002423 dst_offset_first_element_in_bytes,
2424 src_stride_w,
2425 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002426}
2427
2428/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
2429 *
2430 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2431 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2432 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2433 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2434 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002435 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002436 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002437 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002438 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2439 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2440 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2441 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2442 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2443 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2444 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2445 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2446 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2447 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2448 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2449 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2450 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2451 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2452 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002453 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2454 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002455 */
2456__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
2457 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002458 TENSOR3D_DECLARATION(dst),
2459 uint src_stride_w,
2460 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002461{
2462 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
2463 src_stride_x,
2464 src_step_x,
2465 src_stride_y,
2466 src_step_y,
2467 src_stride_z,
2468 src_step_z,
2469 src_offset_first_element_in_bytes,
2470 dst_ptr,
2471 dst_stride_x,
2472 dst_step_x,
2473 dst_stride_y,
2474 dst_step_y,
2475 dst_stride_z,
2476 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002477 dst_offset_first_element_in_bytes,
2478 src_stride_w,
2479 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002480}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002481
2482/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
2483 *
2484 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2485 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2486 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2487 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2488 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002489 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002490 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002491 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002492 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2493 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2494 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2495 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2496 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2497 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2498 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2499 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2500 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2501 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2502 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2503 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2504 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2505 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2506 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002507 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2508 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002509 */
2510__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
2511 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002512 TENSOR3D_DECLARATION(dst),
2513 uint src_stride_w,
2514 uint dst_stride_w)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002515{
2516 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
2517 src_stride_x,
2518 src_step_x,
2519 src_stride_y,
2520 src_step_y,
2521 src_stride_z,
2522 src_step_z,
2523 src_offset_first_element_in_bytes,
2524 dst_ptr,
2525 dst_stride_x,
2526 dst_step_x,
2527 dst_stride_y,
2528 dst_step_y,
2529 dst_stride_z,
2530 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002531 dst_offset_first_element_in_bytes,
2532 src_stride_w,
2533 dst_stride_w);
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002534}
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002535
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002536#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002537/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002538 *
2539 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002540 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2541 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002542 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002543 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002544 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002545 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002546 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002547 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002548 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002549 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2550 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2551 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2552 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2553 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2554 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2555 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2556 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2557 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2558 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2559 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2560 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2561 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2562 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2563 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002564 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2565 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002566 */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002567__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002568 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002569 TENSOR3D_DECLARATION(dst),
2570 uint src_stride_w,
2571 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002572{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002573 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
2574 src_stride_x,
2575 src_step_x,
2576 src_stride_y,
2577 src_step_y,
2578 src_stride_z,
2579 src_step_z,
2580 src_offset_first_element_in_bytes,
2581 dst_ptr,
2582 dst_stride_x,
2583 dst_step_x,
2584 dst_stride_y,
2585 dst_step_y,
2586 dst_stride_z,
2587 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002588 dst_offset_first_element_in_bytes,
2589 src_stride_w,
2590 dst_stride_w);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002591}
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002592
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002593/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
2594 *
2595 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2596 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2597 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2598 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2599 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2600 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2601 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002602 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002603 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002604 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002605 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2606 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2607 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2608 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2609 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2610 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2611 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2612 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2613 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2614 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2615 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2616 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2617 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2618 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2619 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002620 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2621 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002622 */
2623__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
2624 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002625 TENSOR3D_DECLARATION(dst),
2626 uint src_stride_w,
2627 uint dst_stride_w)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002628{
2629 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
2630 src_stride_x,
2631 src_step_x,
2632 src_stride_y,
2633 src_step_y,
2634 src_stride_z,
2635 src_step_z,
2636 src_offset_first_element_in_bytes,
2637 dst_ptr,
2638 dst_stride_x,
2639 dst_step_x,
2640 dst_stride_y,
2641 dst_step_y,
2642 dst_stride_z,
2643 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002644 dst_offset_first_element_in_bytes,
2645 src_stride_w,
2646 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002647}
Michele Di Giorgiof955d512019-02-27 14:26:51 +00002648
2649/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
2650 *
2651 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
2652 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2653 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2654 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2655 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2656 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=7
2657 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
2658 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
2659 *
2660 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
2661 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2662 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2663 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2664 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2665 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2666 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2667 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2668 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2669 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2670 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2671 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2672 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2673 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2674 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2675 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2676 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2677 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
2678 */
2679__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
2680 TENSOR3D_DECLARATION(src),
2681 TENSOR3D_DECLARATION(dst),
2682 uint src_stride_w,
2683 uint dst_stride_w)
2684{
2685 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
2686 src_stride_x,
2687 src_step_x,
2688 src_stride_y,
2689 src_step_y,
2690 src_stride_z,
2691 src_step_z,
2692 src_offset_first_element_in_bytes,
2693 dst_ptr,
2694 dst_stride_x,
2695 dst_step_x,
2696 dst_stride_y,
2697 dst_step_y,
2698 dst_stride_z,
2699 dst_step_z,
2700 dst_offset_first_element_in_bytes,
2701 src_stride_w,
2702 dst_stride_w);
2703}
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002704#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002705#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Michele Di Giorgiof955d512019-02-27 14:26:51 +00002706#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)