blob: 5e5b737785bd3f8f6bd82b5ead95cdb1a765e52b [file] [log] [blame]
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001/*
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01002 * Copyright (c) 2018-2020 Arm Limited.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Giorgio Arena2d1a8352020-10-26 15:04:08 +000026#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(datatype, basename, y_cond, z_cond) \
27 ({ \
28 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \
29 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \
30 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \
31 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \
32 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s0) && (z_cond))); \
33 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s1) && (z_cond))); \
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +010034 })
35
Giorgio Arena2d1a8352020-10-26 15:04:08 +000036#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(datatype, basename, y_cond, z_cond) \
37 ({ \
38 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \
39 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \
40 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \
41 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \
42 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s0))); \
43 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s1))); \
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +010044 })
45
Giorgio Arena2d1a8352020-10-26 15:04:08 +000046#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(datatype, basename, y_cond, z_cond) \
47 ({ \
48 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \
49 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \
50 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \
51 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \
52 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s4) && (z_cond))); \
53 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s5) && (z_cond))); \
54 basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s6) && (z_cond))); \
55 basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s7) && (z_cond))); \
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +010056 })
57
Giorgio Arena2d1a8352020-10-26 15:04:08 +000058#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(datatype, basename, y_cond, z_cond) \
59 ({ \
60 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \
61 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \
62 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \
63 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \
64 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s4))); \
65 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s5))); \
66 basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s6))); \
67 basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s7))); \
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +010068 })
69
Gian Marco Iodiced28b7512018-07-06 12:59:28 +010070#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
71 ({ \
72 comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
73 comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
74 comm_fact.s2 = 2.5f * tmp.s3; \
75 comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
76 comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
77 comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
78 comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
79 \
80 out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
81 out.s1 = comm_fact.s0 + comm_fact.s1; \
82 out.s2 = comm_fact.s0 - comm_fact.s1; \
83 out.s3 = comm_fact.s3 + comm_fact.s4; \
84 out.s4 = comm_fact.s4 - comm_fact.s3; \
85 out.s5 = comm_fact.s5 + comm_fact.s6; \
86 out.s6 = comm_fact.s5 - comm_fact.s6; \
87 out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
88 })
89
Michele Di Giorgiof955d512019-02-27 14:26:51 +000090#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
91 ({ \
92 comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
93 comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
94 comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
95 comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
96 comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
97 comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
98 out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
99 out.s1 = comm_fact.s0 - comm_fact.s1; \
100 out.s2 = comm_fact.s0 + comm_fact.s1; \
101 out.s3 = comm_fact.s2 - comm_fact.s3; \
102 out.s4 = comm_fact.s2 + comm_fact.s3; \
103 out.s5 = comm_fact.s4 - comm_fact.s5; \
104 out.s6 = comm_fact.s4 + comm_fact.s5; \
105 out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
106 })
107
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100108#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
109/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
110 *
111 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
112 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
113 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
114 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
115 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
116 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100117 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100118 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100119 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100120 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
121 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
122 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
123 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
124 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
125 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
126 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
127 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
128 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
129 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
130 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
131 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
132 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
133 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
134 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100135 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
136 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100137 */
138__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
139 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100140 TENSOR3D_DECLARATION(dst),
141 uint src_stride_w,
142 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100143{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100144 const int x = get_global_id(0);
145 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000146#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100147 const int z = get_global_id(2) % SRC_DEPTH;
148 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000149#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000150 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000151#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100152
153 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000154#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100155 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000156#else /* defined(SRC_DEPTH) */
157 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
158#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100159
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100160 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100161
162#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100163 VEC_DATA_TYPE(DATA_TYPE, 4)
164 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100165#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100166 VEC_DATA_TYPE(DATA_TYPE, 4)
167 in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
168 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
169 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
170 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100171#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100172 VEC_DATA_TYPE(DATA_TYPE, 4)
173 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
174 VEC_DATA_TYPE(DATA_TYPE, 4)
175 in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
176 VEC_DATA_TYPE(DATA_TYPE, 4)
177 in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
178 VEC_DATA_TYPE(DATA_TYPE, 4)
179 in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100180#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
181
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100182 VEC_DATA_TYPE(DATA_TYPE, 4)
183 tmp0 = in_row0;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100184
185#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
186 tmp0 -= in_row2;
187#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
188
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100189 DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
190 DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
191 DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
192 DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100193
194#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100195 VEC_DATA_TYPE(DATA_TYPE, 4)
196 tmp1 = in_row1 + in_row2;
197 VEC_DATA_TYPE(DATA_TYPE, 4)
198 tmp2 = in_row2 - in_row1;
199 VEC_DATA_TYPE(DATA_TYPE, 4)
200 tmp3 = in_row1 - in_row3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100201
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100202 DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
203 DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
204 DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
205 DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100206
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100207 DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
208 DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
209 DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
210 DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100211
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100212 DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
213 DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
214 DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
215 DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100216#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
217
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000218#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100219 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000220#else /* defined(SRC_DEPTH) */
221 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
222#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100223
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100224 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
225 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
226 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
227 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100228
229#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100230 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
231 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
232 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
233 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
234 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
235 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
236 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
237 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
238 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
239 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
240 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
241 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100242#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
243}
244
245/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2
246 *
247 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
248 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
249 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
250 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
251 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
252 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100253 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100254 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100255 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100256 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
257 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
258 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
259 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
260 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
261 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
262 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
263 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
264 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
265 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
266 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
267 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
268 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
269 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
270 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100271 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
272 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100273 */
274__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
275 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100276 TENSOR3D_DECLARATION(dst),
277 uint src_stride_w,
278 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100279{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100280 const int x = get_global_id(0);
281 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000282#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100283 const int z = (get_global_id(2) * 2) % SRC_DEPTH;
284 const int b = (get_global_id(2) * 2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000285#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000286 const int z = get_global_id(2) * 2;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000287#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100288
289 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000290#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100291 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000292#else /* defined(SRC_DEPTH) */
293 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
294#endif /* defined(SRC_DEPTH) */
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100295 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100296
297#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100298 VEC_DATA_TYPE(DATA_TYPE, 4)
299 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100300#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100301 VEC_DATA_TYPE(DATA_TYPE, 4)
302 in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
303 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
304 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
305 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100306#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100307 VEC_DATA_TYPE(DATA_TYPE, 4)
308 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
309 VEC_DATA_TYPE(DATA_TYPE, 4)
310 in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
311 VEC_DATA_TYPE(DATA_TYPE, 4)
312 in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
313 VEC_DATA_TYPE(DATA_TYPE, 4)
314 in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100315#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
316
317 src_addr += src_stride_z;
318#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100319 VEC_DATA_TYPE(DATA_TYPE, 4)
320 in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100321#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100322 VEC_DATA_TYPE(DATA_TYPE, 4)
323 in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
324 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
325 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
326 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100327#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100328 VEC_DATA_TYPE(DATA_TYPE, 4)
329 in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
330 VEC_DATA_TYPE(DATA_TYPE, 4)
331 in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
332 VEC_DATA_TYPE(DATA_TYPE, 4)
333 in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
334 VEC_DATA_TYPE(DATA_TYPE, 4)
335 in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100336#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
337
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100338 VEC_DATA_TYPE(DATA_TYPE, 4)
339 tmp0 = in_row0;
340 VEC_DATA_TYPE(DATA_TYPE, 4)
341 tmp4 = in_row4;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100342
343#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
344 tmp0 -= in_row2;
345 tmp4 -= in_row6;
346#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
347
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100348 VEC_DATA_TYPE(DATA_TYPE, 2)
349 out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
350 VEC_DATA_TYPE(DATA_TYPE, 2)
351 out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
352 VEC_DATA_TYPE(DATA_TYPE, 2)
353 out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
354 VEC_DATA_TYPE(DATA_TYPE, 2)
355 out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100356
357#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100358 VEC_DATA_TYPE(DATA_TYPE, 4)
359 tmp1 = in_row1 + in_row2;
360 VEC_DATA_TYPE(DATA_TYPE, 4)
361 tmp2 = in_row2 - in_row1;
362 VEC_DATA_TYPE(DATA_TYPE, 4)
363 tmp3 = in_row1 - in_row3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100364
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100365 VEC_DATA_TYPE(DATA_TYPE, 4)
366 tmp5 = in_row5 + in_row6;
367 VEC_DATA_TYPE(DATA_TYPE, 4)
368 tmp6 = in_row6 - in_row5;
369 VEC_DATA_TYPE(DATA_TYPE, 4)
370 tmp7 = in_row5 - in_row7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100371
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100372 VEC_DATA_TYPE(DATA_TYPE, 2)
373 out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
374 VEC_DATA_TYPE(DATA_TYPE, 2)
375 out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
376 VEC_DATA_TYPE(DATA_TYPE, 2)
377 out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
378 VEC_DATA_TYPE(DATA_TYPE, 2)
379 out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100380
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100381 VEC_DATA_TYPE(DATA_TYPE, 2)
382 out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
383 VEC_DATA_TYPE(DATA_TYPE, 2)
384 out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
385 VEC_DATA_TYPE(DATA_TYPE, 2)
386 out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
387 VEC_DATA_TYPE(DATA_TYPE, 2)
388 out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100389
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100390 VEC_DATA_TYPE(DATA_TYPE, 2)
391 out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
392 VEC_DATA_TYPE(DATA_TYPE, 2)
393 out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
394 VEC_DATA_TYPE(DATA_TYPE, 2)
395 out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
396 VEC_DATA_TYPE(DATA_TYPE, 2)
397 out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100398#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
399
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000400#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100401 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000402#else /* defined(SRC_DEPTH) */
403 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
404#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100405
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100406 vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
407 vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
408 vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
409 vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100410
411#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100412 vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
413 vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
414 vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
415 vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
416 vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
417 vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
418 vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
419 vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
420 vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
421 vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
422 vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
423 vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100424#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
425}
426
Gian Marco Iodice876be2a2018-07-03 12:22:09 +0100427/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100428 *
429 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
430 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
431 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
432 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
433 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
434 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100435 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100436 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100437 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100438 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
439 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
440 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
441 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
442 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
443 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
444 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
445 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
446 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
447 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
448 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
449 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
450 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
451 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
452 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100453 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
454 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100455 */
456__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
457 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100458 TENSOR3D_DECLARATION(dst),
459 uint src_stride_w,
460 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100461{
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100462 const int x = get_global_id(0);
463 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000464#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100465 const int z = get_global_id(2) % SRC_DEPTH;
466 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000467#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000468 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000469#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100470
471 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000472#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100473 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000474#else /* defined(SRC_DEPTH) */
475 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
476#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100477
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100478 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100479
480#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
481 // Row0
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100482 VEC_DATA_TYPE(DATA_TYPE, 4)
483 d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
484 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
485 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
486 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
487 VEC_DATA_TYPE(DATA_TYPE, 2)
488 d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
489 *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100490#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
491 // Row0
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100492 VEC_DATA_TYPE(DATA_TYPE, 4)
493 d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
494 VEC_DATA_TYPE(DATA_TYPE, 2)
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000495 d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100496#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
497
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100498 DATA_TYPE out0 = 0.0f;
499 DATA_TYPE out1 = 0.0f;
500 DATA_TYPE out2 = 0.0f;
501 DATA_TYPE out3 = 0.0f;
502 DATA_TYPE out4 = 0.0f;
503 DATA_TYPE out5 = 0.0f;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100504
505 // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
506 out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
507 out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
508 out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
509 out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
510 out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
511 out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
512
513#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
514 // Row4
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100515 VEC_DATA_TYPE(DATA_TYPE, 4)
516 d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
517 VEC_DATA_TYPE(DATA_TYPE, 2)
518 d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100519
520 // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100521 DATA_TYPE k0 = d41.s0;
522 DATA_TYPE k1 = d41.s0;
523 DATA_TYPE k2 = d41.s0;
524 DATA_TYPE k3 = d41.s0;
525 DATA_TYPE k4 = d41.s0;
526 DATA_TYPE k5 = 0.0f;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100527
528 k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
529 k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
530 k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
531 k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
532 k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
533 k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
534
535 out0 += k0;
536 out1 += k1;
537 out2 += k2;
538 out3 += k3;
539 out4 += k4;
540 out5 += k5;
541
542 // Row2
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100543 VEC_DATA_TYPE(DATA_TYPE, 4)
544 d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
545 VEC_DATA_TYPE(DATA_TYPE, 2)
546 d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100547
548 out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
549 out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
550 out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
551 out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
552 out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
553 out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
554#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
555
556 // Compute destination address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000557#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100558 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000559#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000560 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000561#endif /* defined(SRC_DEPTH) */
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100562
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100563 uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100564
565 *(dst_addr) = out0;
566 dst_addr += dst_plane_stride;
567 *(dst_addr) = out1;
568 dst_addr += dst_plane_stride;
569 *(dst_addr) = out2;
570 dst_addr += dst_plane_stride;
571 *(dst_addr) = out3;
572 dst_addr += dst_plane_stride;
573 *(dst_addr) = out4;
574 dst_addr += dst_plane_stride;
575 *(dst_addr) = out5;
576 dst_addr += dst_plane_stride;
577
578#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100579 DATA_TYPE out6 = k0;
580 DATA_TYPE out7 = k1;
581 DATA_TYPE out8 = k2;
582 DATA_TYPE out9 = k3;
583 DATA_TYPE out10 = k4;
584 DATA_TYPE out11 = k5;
585 DATA_TYPE out12 = k0;
586 DATA_TYPE out13 = k1;
587 DATA_TYPE out14 = k2;
588 DATA_TYPE out15 = k3;
589 DATA_TYPE out16 = k4;
590 DATA_TYPE out17 = k5;
591 DATA_TYPE out18 = k0;
592 DATA_TYPE out19 = k1;
593 DATA_TYPE out20 = k2;
594 DATA_TYPE out21 = k3;
595 DATA_TYPE out22 = k4;
596 DATA_TYPE out23 = k5;
597 DATA_TYPE out24 = k0;
598 DATA_TYPE out25 = k1;
599 DATA_TYPE out26 = k2;
600 DATA_TYPE out27 = k3;
601 DATA_TYPE out28 = k4;
602 DATA_TYPE out29 = k5;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100603
604 // Row1
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100605 VEC_DATA_TYPE(DATA_TYPE, 4)
606 d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
607 VEC_DATA_TYPE(DATA_TYPE, 2)
608 d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100609
610 // Row3
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100611 VEC_DATA_TYPE(DATA_TYPE, 4)
612 d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
613 VEC_DATA_TYPE(DATA_TYPE, 2)
614 d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100615
616 // Compute common parts for the channels between [6, 29]
617 // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
618 // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100619 DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
620 DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
621 DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
622 DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
623 DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
624 DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
625 DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
626 DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
627 DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
628 DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
629 DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
630 DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100631
632 // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
633 // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100634 DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
635 DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
636 DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
637 DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
638 DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
639 DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
640 DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
641 DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
642 DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
643 DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
644 DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
645 DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100646
647 out6 += part0 - part1;
648 out12 += part0 + part1;
649 out7 += part2 + part3 + part4 + part5;
650 out8 += part2 - part3 + part4 - part5;
651 out13 += part2 + part3 - part4 - part5;
652 out14 += part2 - part3 - part4 + part5;
653 out9 += part6 + part7 + part8 + part9;
654 out10 += part6 - part7 + part8 - part9;
655 out15 += part6 - part7 - part8 + part9;
656 out16 += part6 + part7 - part8 - part9;
657 out11 += part10 + part11;
658 out17 += part10 - part11;
659
660 out18 += part13 - part12;
661 out24 += part13 + part12;
662 out19 += part14 + part15 + part16 + part17;
663 out20 += part14 - part15 + part16 - part17;
664 out25 += part14 - part15 - part16 + part17;
665 out26 += part14 + part15 - part16 - part17;
666 out21 += part18 + part19 + part20 + part21;
667 out22 += part18 - part19 + part20 - part21;
668 out27 += part18 - part19 - part20 + part21;
669 out28 += part18 + part19 - part20 - part21;
670 out23 += part22 + part23;
671 out29 += part22 - part23;
672
673 *(dst_addr) = out6;
674 dst_addr += dst_plane_stride;
675 *(dst_addr) = out7;
676 dst_addr += dst_plane_stride;
677 *(dst_addr) = out8;
678 dst_addr += dst_plane_stride;
679 *(dst_addr) = out9;
680 dst_addr += dst_plane_stride;
681 *(dst_addr) = out10;
682 dst_addr += dst_plane_stride;
683 *(dst_addr) = out11;
684 dst_addr += dst_plane_stride;
685 *(dst_addr) = out12;
686 dst_addr += dst_plane_stride;
687 *(dst_addr) = out13;
688 dst_addr += dst_plane_stride;
689 *(dst_addr) = out14;
690 dst_addr += dst_plane_stride;
691 *(dst_addr) = out15;
692 dst_addr += dst_plane_stride;
693 *(dst_addr) = out16;
694 dst_addr += dst_plane_stride;
695 *(dst_addr) = out17;
696 dst_addr += dst_plane_stride;
697
698 *(dst_addr) = out18;
699 dst_addr += dst_plane_stride;
700 *(dst_addr) = out19;
701 dst_addr += dst_plane_stride;
702 *(dst_addr) = out20;
703 dst_addr += dst_plane_stride;
704 *(dst_addr) = out21;
705 dst_addr += dst_plane_stride;
706 *(dst_addr) = out22;
707 dst_addr += dst_plane_stride;
708 *(dst_addr) = out23;
709 dst_addr += dst_plane_stride;
710 *(dst_addr) = out24;
711 dst_addr += dst_plane_stride;
712 *(dst_addr) = out25;
713 dst_addr += dst_plane_stride;
714 *(dst_addr) = out26;
715 dst_addr += dst_plane_stride;
716 *(dst_addr) = out27;
717 dst_addr += dst_plane_stride;
718 *(dst_addr) = out28;
719 dst_addr += dst_plane_stride;
720 *(dst_addr) = out29;
721 dst_addr += dst_plane_stride;
722
723 // Row5
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100724 VEC_DATA_TYPE(DATA_TYPE, 4)
725 d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
726 VEC_DATA_TYPE(DATA_TYPE, 2)
727 d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100728
729 // Channels [30, 35]
730 out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
731 out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
732 out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
733 out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
734 out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
735 out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
736
737 *(dst_addr) = out0;
738 dst_addr += dst_plane_stride;
739 *(dst_addr) = out1;
740 dst_addr += dst_plane_stride;
741 *(dst_addr) = out2;
742 dst_addr += dst_plane_stride;
743 *(dst_addr) = out3;
744 dst_addr += dst_plane_stride;
745 *(dst_addr) = out4;
746 dst_addr += dst_plane_stride;
747 *(dst_addr) = out5;
748 dst_addr += dst_plane_stride;
749#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
750}
751
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100752/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
753 *
754 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
755 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
756 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
757 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
758 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
759 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
760 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
761 *
762 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
763 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
764 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
765 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
766 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
767 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
768 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
769 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
770 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
771 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
772 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
773 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
774 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
775 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
776 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
777 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
778 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
779 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
780 */
781__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
782 TENSOR3D_DECLARATION(src),
783 TENSOR3D_DECLARATION(dst),
784 uint src_stride_w,
785 uint dst_stride_w)
786{
787 const int x = get_global_id(0);
788 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000789#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100790 const int z = get_global_id(2) % SRC_DEPTH;
791 const int b = get_global_id(2) / SRC_DEPTH;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000792#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000793 const int z = get_global_id(2);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000794#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100795
796 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000797#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100798 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000799#else /* defined(SRC_DEPTH) */
Michele Di Giorgiof955d512019-02-27 14:26:51 +0000800 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000801#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100802 src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
803
804 // Load input tile
805#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
806 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
807#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
808 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
809 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
810 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
811 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
812 *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
813 *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
814 *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
815 *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
816#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
817 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
818 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
819 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
820 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
821 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
822 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
823 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
824 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
825#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
826
827 // Calculate common factors for intermediate tensor
828 VEC_DATA_TYPE(DATA_TYPE, 8)
829 tmp0 = in_row0;
830 VEC_DATA_TYPE(DATA_TYPE, 8)
831 comm_fact0 = 0.0f;
832
833#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
834 comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25 * in_row4;
835 tmp0 += -in_row6 + (DATA_TYPE)5.25 * in_row4 - (DATA_TYPE)5.25 * in_row2;
836
837 VEC_DATA_TYPE(DATA_TYPE, 8)
838 comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25 * in_row3;
839 VEC_DATA_TYPE(DATA_TYPE, 8)
840 comm_fact2 = (DATA_TYPE)0.25 * in_row2 - (DATA_TYPE)1.25 * in_row4 + in_row6;
841
842 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
843 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
844
845 comm_fact0 = (DATA_TYPE)2.5 * in_row3;
846 comm_fact1 = (DATA_TYPE)0.5 * in_row1 - comm_fact0 + (DATA_TYPE)2.0 * in_row5;
847
848 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
849 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
850
851 comm_fact1 = (DATA_TYPE)2.0 * in_row1 - comm_fact0 + (DATA_TYPE)0.5 * in_row5;
852 comm_fact2 = (DATA_TYPE)4.0 * in_row2 - (DATA_TYPE)5.0 * in_row4 + in_row6;
853
854 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
855 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
856 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25 * in_row3 - (DATA_TYPE)5.25 * in_row5;
857#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
858
859 // Calculate output rows (reuse comm_fact0 vector)
860 VEC_DATA_TYPE(DATA_TYPE, 8)
861 out0;
862
863 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
864
865#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
866 VEC_DATA_TYPE(DATA_TYPE, 8)
867 out1, out2, out3, out4, out5, out6, out7;
868
869 OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
870 OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
871 OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
872 OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
873 OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
874 OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
875 OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
876#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
877
878 // Store values across the channels
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000879#if defined(SRC_DEPTH)
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100880 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000881#else /* defined(SRC_DEPTH) */
882 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
883#endif /* defined(SRC_DEPTH) */
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100884
885 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
886 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
887 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
888 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
889 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
890 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
891 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
892 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
893
894#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
895 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
896 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
897 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
898 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
899 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
900 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
901 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
902 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
903 *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
904 *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
905 *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
906 *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
907 *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
908 *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
909 *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
910 *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
911 *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
912 *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
913 *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
914 *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
915 *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
916 *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
917 *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
918 *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
919 *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
920 *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
921 *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
922 *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
923 *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
924 *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
925 *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
926 *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
927 *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
928 *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
929 *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
930 *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
931 *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
932 *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
933 *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
934 *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
935 *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
936 *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
937 *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
938 *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
939 *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
940 *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
941 *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
942 *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
943 *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
944 *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
945 *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
946 *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
947 *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
948 *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
949 *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
950 *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
951#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
952}
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100953
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000954#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100955/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100956 *
957 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
958 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
959 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
960 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
Giorgio Arena149fdf32018-07-04 17:03:33 +0100961 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
962 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
963 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
964 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100965 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100966 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +0100967 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100968 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
969 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
970 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
971 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
972 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
973 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
974 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
975 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
976 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
977 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
978 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
979 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
980 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
981 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
982 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100983 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
984 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100985 */
986__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
987 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100988 TENSOR3D_DECLARATION(dst),
989 uint src_stride_w,
990 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +0100991{
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100992 // Index channel
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100993 const int x = get_global_id(0);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100994 // Index width
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100995 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +0000996#if defined(NUM_TILES_Y)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100997 // Index height
Georgios Pinitasc55beee2018-10-23 15:23:23 +0100998 const int z = get_global_id(2) % NUM_TILES_Y;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +0100999 // Index batch size
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001000 const int b = get_global_id(2) / NUM_TILES_Y;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001001#else // defined(NUM_TILES_Y)
1002 // Index height
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001003 const int z = get_global_id(2);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001004#endif // defined(NUM_TILES_Y)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001005
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001006#if defined(NUM_TILES_Y)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001007 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001008#else // defined(NUM_TILES_Y)
1009 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
1010#endif // defined(NUM_TILES_Y)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001011
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001012 // Origin coordinates for the width (y) and height (z) in the input tensor
Giorgio Arena149fdf32018-07-04 17:03:33 +01001013 int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
1014 int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001015 int4 z_coord0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
1016 int2 z_coord1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001017
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001018 // Coordinates to use to avoid out-of-bound reads
1019 int4 y_coord_valid0 = clamp(y_coord0, (int4)0, (int4)((int)SRC_DIM_1 - 1));
1020 int2 y_coord_valid1 = clamp(y_coord1, (int2)0, (int2)((int)SRC_DIM_1 - 1));
1021 int4 z_coord_valid0 = clamp(z_coord0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
1022 int2 z_coord_valid1 = clamp(z_coord1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
1023
1024 // Boundary conditions
1025 int4 y_cond0 = y_coord_valid0 == y_coord0;
1026 int2 y_cond1 = y_coord_valid1 == y_coord1;
1027 int4 z_cond0 = z_coord_valid0 == z_coord0;
1028 int2 z_cond1 = z_coord_valid1 == z_coord1;
Giorgio Arena149fdf32018-07-04 17:03:33 +01001029
1030#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001031
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001032 DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1033 DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1034 DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1035 DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1036 DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1037 DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001038
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001039 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d4, y_cond, z_cond1.s0);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001040
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001041 DATA_TYPE k0 = d44;
1042 DATA_TYPE k1 = d44;
1043 DATA_TYPE k2 = d44;
1044 DATA_TYPE k3 = d44;
1045 DATA_TYPE k4 = d44;
1046 DATA_TYPE k5 = (DATA_TYPE)0.0f;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001047
1048 k0 += 4.0f * d40 - 5.0f * d42;
1049 k1 += -4.0f * d41 - 4.0f * d42 + d43;
1050 k2 += 4.0f * d41 - 4.0f * d42 - d43;
1051 k3 += -2.0f * d41 + 2.0f * d43 - d42;
1052 k4 += 2.0f * d41 - 2.0f * d43 - d42;
1053 k5 += 4.0f * d41 - 5.0f * d43 + d45;
Giorgio Arena149fdf32018-07-04 17:03:33 +01001054#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001055
Giorgio Arena149fdf32018-07-04 17:03:33 +01001056#if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001057 DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1058 DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1059 DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1060 DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1061 DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1062 DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001063
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001064 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d0, y_cond, z_cond0.s0);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001065
Giorgio Arena149fdf32018-07-04 17:03:33 +01001066#else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001067 DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1068 DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1069 DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1070 DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1071 DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1072 DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001073
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001074 FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(DATA_TYPE, d0, y_cond0.s0, z_cond);
Giorgio Arena149fdf32018-07-04 17:03:33 +01001075#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1076
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001077 DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
1078 DATA_TYPE out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
1079 DATA_TYPE out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
1080 DATA_TYPE out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
1081 DATA_TYPE out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
1082 DATA_TYPE out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
Giorgio Arena149fdf32018-07-04 17:03:33 +01001083
1084#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001085 DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1086 DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1087 DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1088 DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1089 DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1090 DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001091
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001092 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d2, y_cond, z_cond0.s2);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001093
Giorgio Arena149fdf32018-07-04 17:03:33 +01001094 out0 += k0;
1095 out1 += k1;
1096 out2 += k2;
1097 out3 += k3;
1098 out4 += k4;
1099 out5 += k5;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001100 DATA_TYPE out6 = k0;
1101 DATA_TYPE out7 = k1;
1102 DATA_TYPE out8 = k2;
1103 DATA_TYPE out9 = k3;
1104 DATA_TYPE out10 = k4;
1105 DATA_TYPE out11 = k5;
1106 DATA_TYPE out12 = k0;
1107 DATA_TYPE out13 = k1;
1108 DATA_TYPE out14 = k2;
1109 DATA_TYPE out15 = k3;
1110 DATA_TYPE out16 = k4;
1111 DATA_TYPE out17 = k5;
1112 DATA_TYPE out18 = k0;
1113 DATA_TYPE out19 = k1;
1114 DATA_TYPE out20 = k2;
1115 DATA_TYPE out21 = k3;
1116 DATA_TYPE out22 = k4;
1117 DATA_TYPE out23 = k5;
1118 DATA_TYPE out24 = k0;
1119 DATA_TYPE out25 = k1;
1120 DATA_TYPE out26 = k2;
1121 DATA_TYPE out27 = k3;
1122 DATA_TYPE out28 = k4;
1123 DATA_TYPE out29 = k5;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001124
1125 // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
Giorgio Arena149fdf32018-07-04 17:03:33 +01001126 out0 += -20.0f * d20 + 25.0f * d22 - 5.0f * d24;
1127 out1 += 20.0f * d21 + 20.0f * d22 - 5.0f * d23 - 5.0f * d24;
1128 out2 += -20.0f * d21 + 20.0f * d22 + 5.0f * d23 - 5.0f * d24;
1129 out3 += 10.0f * d21 + 5.0f * d22 - 10.0f * d23 - 5.0f * d24;
1130 out4 += -10.0f * d21 + 5.0f * d22 + 10.0f * d23 - 5.0f * d24;
1131 out5 += -20.0f * d21 + 25.0f * d23 - 5.0f * d25;
1132#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1133
1134 // Compute destination address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001135#if defined(NUM_TILES_Y)
1136 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001137#else // defined(NUM_TILES_Y)
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001138 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001139#endif // defined(NUM_TILES_Y)
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001140
1141 uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001142
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001143 *((__global DATA_TYPE *)dst_addr) = out0;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001144 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001145 *((__global DATA_TYPE *)dst_addr) = out1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001146 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001147 *((__global DATA_TYPE *)dst_addr) = out2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001148 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001149 *((__global DATA_TYPE *)dst_addr) = out3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001150 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001151 *((__global DATA_TYPE *)dst_addr) = out4;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001152 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001153 *((__global DATA_TYPE *)dst_addr) = out5;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001154 dst_addr += dst_plane_stride;
1155
Giorgio Arena149fdf32018-07-04 17:03:33 +01001156#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001157 DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1158 DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1159 DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1160 DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1161 DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1162 DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001163
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001164 DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1165 DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1166 DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1167 DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1168 DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1169 DATA_TYPE d35 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001170
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001171 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d1, y_cond, z_cond0.s1);
1172 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d3, y_cond, z_cond0.s3);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001173
1174 // Compute common parts for the channels between [6, 29]
1175 // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
1176 // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001177 DATA_TYPE part0 = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;
1178 DATA_TYPE part1 = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;
1179 DATA_TYPE part2 = 16.0f * d22 - 4.0f * d24;
1180 DATA_TYPE part3 = 16.0f * d21 - 4.0f * d23;
1181 DATA_TYPE part4 = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;
1182 DATA_TYPE part5 = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;
1183 DATA_TYPE part6 = 4.0f * d22 - 4.0f * d24;
1184 DATA_TYPE part7 = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;
1185 DATA_TYPE part8 = 4.0f * d12 - 4.0f * d14 - d32 + d34;
1186 DATA_TYPE part9 = 8.0f * d21 - 8.0f * d23;
1187 DATA_TYPE part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;
1188 DATA_TYPE part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001189
1190 // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
1191 // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001192 DATA_TYPE part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;
1193 DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20 + 5.0f * d22 - d24
1194 DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d22 - d24
1195 DATA_TYPE part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;
1196 DATA_TYPE part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;
1197 DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d21 - d23
1198 DATA_TYPE part18 = part6 * 0.25f; // d22 - d24
1199 DATA_TYPE part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;
1200 DATA_TYPE part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;
1201 DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d21 - d23)
1202 DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d21 + 5.0f * d23 - d25
1203 DATA_TYPE part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35; // - 8.0f * d11 + 10.0f * d13 - 2.0f * d15 + 8.0f * d31 - 10.0f * d33 + 2.0f * d35;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001204
1205 out6 += part0 - part1;
1206 out12 += part0 + part1;
1207 out7 += part2 + part3 + part4 + part5;
1208 out8 += part2 - part3 + part4 - part5;
1209 out13 += part2 + part3 - part4 - part5;
1210 out14 += part2 - part3 - part4 + part5;
1211 out9 += part6 + part7 + part8 + part9;
1212 out10 += part6 - part7 + part8 - part9;
1213 out15 += part6 - part7 - part8 + part9;
1214 out16 += part6 + part7 - part8 - part9;
1215 out11 += part10 + part11;
1216 out17 += part10 - part11;
1217
1218 out18 += part13 - part12;
1219 out24 += part13 + part12;
1220 out19 += part14 + part15 + part16 + part17;
1221 out20 += part14 - part15 + part16 - part17;
1222 out25 += part14 - part15 - part16 + part17;
1223 out26 += part14 + part15 - part16 - part17;
1224 out21 += part18 + part19 + part20 + part21;
1225 out22 += part18 - part19 + part20 - part21;
1226 out27 += part18 - part19 - part20 + part21;
1227 out28 += part18 + part19 - part20 - part21;
1228 out23 += part22 + part23;
1229 out29 += part22 - part23;
1230
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001231 *((__global DATA_TYPE *)dst_addr) = out6;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001232 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001233 *((__global DATA_TYPE *)dst_addr) = out7;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001234 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001235 *((__global DATA_TYPE *)dst_addr) = out8;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001236 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001237 *((__global DATA_TYPE *)dst_addr) = out9;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001238 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001239 *((__global DATA_TYPE *)dst_addr) = out10;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001240 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001241 *((__global DATA_TYPE *)dst_addr) = out11;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001242 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001243 *((__global DATA_TYPE *)dst_addr) = out12;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001244 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001245 *((__global DATA_TYPE *)dst_addr) = out13;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001246 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001247 *((__global DATA_TYPE *)dst_addr) = out14;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001248 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001249 *((__global DATA_TYPE *)dst_addr) = out15;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001250 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001251 *((__global DATA_TYPE *)dst_addr) = out16;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001252 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001253 *((__global DATA_TYPE *)dst_addr) = out17;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001254 dst_addr += dst_plane_stride;
1255
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001256 *((__global DATA_TYPE *)dst_addr) = out18;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001257 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001258 *((__global DATA_TYPE *)dst_addr) = out19;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001259 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001260 *((__global DATA_TYPE *)dst_addr) = out20;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001261 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001262 *((__global DATA_TYPE *)dst_addr) = out21;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001263 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001264 *((__global DATA_TYPE *)dst_addr) = out22;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001265 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001266 *((__global DATA_TYPE *)dst_addr) = out23;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001267 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001268 *((__global DATA_TYPE *)dst_addr) = out24;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001269 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001270 *((__global DATA_TYPE *)dst_addr) = out25;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001271 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001272 *((__global DATA_TYPE *)dst_addr) = out26;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001273 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001274 *((__global DATA_TYPE *)dst_addr) = out27;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001275 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001276 *((__global DATA_TYPE *)dst_addr) = out28;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001277 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001278 *((__global DATA_TYPE *)dst_addr) = out29;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001279 dst_addr += dst_plane_stride;
1280
1281 // Row5
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001282 DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1283 DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1284 DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1285 DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1286 DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1287 DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001288
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001289 FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d5, y_cond, z_cond1.s1);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001290
1291 // Channels [30, 35]
1292 out0 = 16.0f * d10 - 20.0f * d12 - 20.0f * d30 + 25.0f * d32 + 4.0f * d50 - 5.0f * d52 + d54 + 4.0f * d14 - 5.0f * d34;
1293 out1 = -16.0f * d11 - 16.0f * d12 + 4.0f * d13 + 20.0f * d31 + 20.0f * d32 - 5.0f * d33 - 4.0f * d51 - 4.0f * d52 + d53 + d54 + 4.0f * d14 - 5.0f * d34;
1294 out2 = 16.0f * d11 - 16.0f * d12 - 4.0f * d13 - 20.0f * d31 + 20.0f * d32 + 5.0f * d33 + 4.0f * d51 - 4.0f * d52 - d53 + d54 + 4.0f * d14 - 5.0f * d34;
1295 out3 = -8.0f * d11 - 4.0f * d12 + 8.0f * d13 + 10.0f * d31 - 10.0f * d33 + 5.0f * d32 - 2.0f * d51 + 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
1296 out4 = 8.0f * d11 - 4.0f * d12 - 8.0f * d13 - 10.0f * d31 + 5.0f * d32 + 10.0f * d33 + 2.0f * d51 - 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
1297 out5 = 16.0f * d11 - 20.0f * d13 + 4.0f * d15 - 20.0f * d31 + 25.0f * d33 - 5.0f * d35 + 4.0f * d51 - 5.0f * d53 + d55;
1298
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001299 *((__global DATA_TYPE *)dst_addr) = out0;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001300 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001301 *((__global DATA_TYPE *)dst_addr) = out1;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001302 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001303 *((__global DATA_TYPE *)dst_addr) = out2;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001304 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001305 *((__global DATA_TYPE *)dst_addr) = out3;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001306 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001307 *((__global DATA_TYPE *)dst_addr) = out4;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001308 dst_addr += dst_plane_stride;
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001309 *((__global DATA_TYPE *)dst_addr) = out5;
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001310 dst_addr += dst_plane_stride;
Giorgio Arena149fdf32018-07-04 17:03:33 +01001311#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001312}
1313
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001314/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
Giorgio Arena149fdf32018-07-04 17:03:33 +01001315 *
1316 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1317 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001318 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
1319 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001320 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
Giorgio Arena149fdf32018-07-04 17:03:33 +01001321 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001322 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1323 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001324 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arena149fdf32018-07-04 17:03:33 +01001325 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001326 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arena149fdf32018-07-04 17:03:33 +01001327 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1328 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1329 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1330 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1331 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1332 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1333 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1334 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1335 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1336 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1337 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1338 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1339 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1340 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1341 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001342 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1343 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001344 */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001345__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
Giorgio Arena149fdf32018-07-04 17:03:33 +01001346 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001347 TENSOR3D_DECLARATION(dst),
1348 uint src_stride_w,
1349 uint dst_stride_w)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001350{
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001351 const int x = get_global_id(0);
1352 const int y = get_global_id(1);
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001353#if defined(NUM_TILES_Y)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001354 const int z = get_global_id(2) % NUM_TILES_Y;
1355 const int b = get_global_id(2) / NUM_TILES_Y;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001356#else // defined(NUM_TILES_Y)
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001357 const int z = get_global_id(2);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001358#endif // defined(NUM_TILES_Y)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001359
1360 // Compute input address
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001361#if defined(NUM_TILES_Y)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001362 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001363#else // defined(NUM_TILES_Y)
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001364 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001365#endif // defined(NUM_TILES_Y)
1366
1367 // Origin coordinates for the width (y) and height (z) in the input tensor
1368 int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
1369 int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
1370
1371 // Coordinates to use to avoid out-of-bound reads
1372 int8 y_coord_valid0 = clamp(y_coord0, (int8)0, (int8)((int)SRC_DIM_1 - 1));
1373 int8 z_coord_valid0 = clamp(z_coord0, (int8)0, (int8)((int)SRC_DIM_2 - 1));
1374
1375 // Boundary conditions
1376 int8 y_cond0 = y_coord_valid0 == y_coord0;
1377 int8 z_cond0 = z_coord_valid0 == z_coord0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001378
1379#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001380
1381 // Load the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001382 VEC_DATA_TYPE(DATA_TYPE, 8)
1383 in_row0;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001384 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1385 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1386 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1387 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1388 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1389 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1390 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1391 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1392
1393 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001394
1395 // Calculate common factors for intermediate tensor
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001396 VEC_DATA_TYPE(DATA_TYPE, 8)
1397 comm_fact0 = 0.0f;
1398 VEC_DATA_TYPE(DATA_TYPE, 8)
1399 tmp0 = in_row0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001400
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001401 VEC_DATA_TYPE(DATA_TYPE, 8)
1402 out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001403
1404 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
1405
1406#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001407
1408 // Load the input tile
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001409 VEC_DATA_TYPE(DATA_TYPE, 8)
1410 in_row0;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001411 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1412 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1413 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1414 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1415 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1416 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1417 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1418 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1419
1420 FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(DATA_TYPE, in_row0.s, y_cond0.s0, z_cond);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001421
1422 // Calculate common factors for intermediate tensor
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001423 VEC_DATA_TYPE(DATA_TYPE, 8)
1424 comm_fact0 = 0.0f;
1425 VEC_DATA_TYPE(DATA_TYPE, 8)
1426 tmp0 = in_row0;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001427
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001428 VEC_DATA_TYPE(DATA_TYPE, 8)
1429 out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001430
1431 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
1432#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001433 VEC_DATA_TYPE(DATA_TYPE, 8)
1434 in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001435
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001436 // Row0
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001437 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1438 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1439 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1440 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1441 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1442 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1443 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1444 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001445
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001446 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001447
1448 // Row1
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001449 in_row1.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1450 in_row1.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1451 in_row1.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1452 in_row1.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1453 in_row1.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1454 in_row1.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1455 in_row1.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1456 in_row1.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001457
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001458 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row1.s, y_cond, z_cond0.s1);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001459
1460 // Row2
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001461 in_row2.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1462 in_row2.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1463 in_row2.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1464 in_row2.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1465 in_row2.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1466 in_row2.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1467 in_row2.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1468 in_row2.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001469
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001470 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row2.s, y_cond, z_cond0.s2);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001471
1472 // Row3
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001473 in_row3.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1474 in_row3.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1475 in_row3.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1476 in_row3.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1477 in_row3.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1478 in_row3.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1479 in_row3.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1480 in_row3.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001481
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001482 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row3.s, y_cond, z_cond0.s3);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001483
1484 // Row4
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001485 in_row4.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1486 in_row4.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1487 in_row4.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1488 in_row4.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1489 in_row4.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1490 in_row4.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1491 in_row4.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1492 in_row4.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001493
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001494 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row4.s, y_cond, z_cond0.s4);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001495
1496 // Row5
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001497 in_row5.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1498 in_row5.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1499 in_row5.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1500 in_row5.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1501 in_row5.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1502 in_row5.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1503 in_row5.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1504 in_row5.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001505
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001506 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row5.s, y_cond, z_cond0.s5);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001507
1508 // Row6
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001509 in_row6.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1510 in_row6.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1511 in_row6.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1512 in_row6.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1513 in_row6.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1514 in_row6.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1515 in_row6.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1516 in_row6.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001517
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001518 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row6.s, y_cond, z_cond0.s6);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001519
1520 // Row7
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001521 in_row7.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1522 in_row7.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1523 in_row7.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1524 in_row7.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1525 in_row7.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1526 in_row7.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1527 in_row7.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1528 in_row7.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001529
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001530 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row7.s, y_cond, z_cond0.s7);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001531
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001532 VEC_DATA_TYPE(DATA_TYPE, 8)
1533 comm_fact0 = in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
1534 VEC_DATA_TYPE(DATA_TYPE, 8)
1535 comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
1536 VEC_DATA_TYPE(DATA_TYPE, 8)
1537 comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001538
1539 // Calculate intermediate tensor and reuse common factor vectors
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001540 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = in_row0 - in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
1541 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
1542 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001543
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001544 comm_fact0 = (DATA_TYPE)2.5f * in_row3;
1545 comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.f * in_row5;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001546
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001547 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
1548 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001549
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001550 comm_fact1 = (DATA_TYPE)2.f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
1551 comm_fact2 = (DATA_TYPE)4.f * in_row2 - (DATA_TYPE)5.f * in_row4 + in_row6;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001552
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001553 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
1554 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
1555 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001556
1557 // Calculate output rows (reuse comm_fact0 vector)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001558 VEC_DATA_TYPE(DATA_TYPE, 8)
1559 out0, out1, out2, out3, out4, out5, out6, out7;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001560 OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
1561 OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
1562 OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
1563 OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
1564 OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
1565 OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
1566 OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
1567 OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001568#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1569
1570 // Store values across the channels
1571#if defined(NUM_TILES_Y)
1572 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
1573#else /* NUM_TILES_Y */
1574 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
1575#endif /* NUM_TILES_Y */
1576
1577 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
1578 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
1579 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
1580 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
1581 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
1582 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
1583 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
1584 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
1585
1586#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1587 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
1588 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
1589 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
1590 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
1591 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
1592 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
1593 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
1594 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
1595 *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
1596 *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
1597 *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
1598 *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
1599 *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
1600 *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
1601 *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
1602 *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
1603 *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
1604 *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
1605 *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
1606 *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
1607 *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
1608 *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
1609 *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
1610 *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
1611 *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
1612 *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
1613 *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
1614 *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
1615 *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
1616 *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
1617 *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
1618 *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
1619 *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
1620 *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
1621 *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
1622 *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
1623 *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
1624 *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
1625 *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
1626 *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
1627 *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
1628 *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
1629 *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
1630 *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
1631 *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
1632 *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
1633 *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
1634 *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
1635 *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
1636 *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
1637 *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
1638 *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
1639 *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
1640 *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
1641 *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
1642 *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
1643#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1644}
1645
1646/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
1647 *
1648 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
1649 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1650 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
1651 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1652 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1653 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
1654 * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
1655 * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
1656 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1657 *
1658 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
1659 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1660 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1661 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1662 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1663 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1664 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1665 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1666 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1667 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1668 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1669 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1670 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1671 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1672 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1673 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1674 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1675 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1676 */
1677__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
1678 TENSOR3D_DECLARATION(src),
1679 TENSOR3D_DECLARATION(dst),
1680 uint src_stride_w,
1681 uint dst_stride_w)
1682{
1683 const int x = get_global_id(0);
1684 const int y = get_global_id(1);
1685#if defined(NUM_TILES_Y)
1686 const int z = get_global_id(2) % NUM_TILES_Y;
1687 const int b = get_global_id(2) / NUM_TILES_Y;
1688#else /* defined(NUM_TILES_Y) */
1689 const int z = get_global_id(2);
1690#endif /* defined(NUM_TILES_Y) */
1691
1692 // Compute input address
1693#if defined(NUM_TILES_Y)
1694 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
1695#else /* defined(NUM_TILES_Y) */
1696 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
1697#endif /* defined(NUM_TILES_Y) */
1698
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001699 // Origin coordinates for the width (y) and height (z) in the input tensor
1700 int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
1701 int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
1702
1703 // Coordinates to use to avoid out-of-bound reads
1704 int8 y_coord_valid0 = clamp(y_coord0, (int8)0, (int8)((int)SRC_DIM_1 - 1));
1705 int8 z_coord_valid0 = clamp(z_coord0, (int8)0, (int8)((int)SRC_DIM_2 - 1));
1706
1707 // Boundary conditions
1708 int8 y_cond0 = y_coord_valid0 == y_coord0;
1709 int8 z_cond0 = z_coord_valid0 == z_coord0;
1710
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001711#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1712
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001713 // Load the input tile
1714 VEC_DATA_TYPE(DATA_TYPE, 8)
1715 in_row0;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001716 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1717 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1718 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1719 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1720 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1721 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1722 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1723 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1724
1725 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001726
1727 VEC_DATA_TYPE(DATA_TYPE, 8)
1728 out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
1729
1730 VEC_DATA_TYPE(DATA_TYPE, 8)
1731 tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
1732
1733 VEC_DATA_TYPE(DATA_TYPE, 8)
1734 comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
1735
1736 OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
1737
1738#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001739 // Load the input tile
1740 VEC_DATA_TYPE(DATA_TYPE, 8)
1741 in_row0;
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001742 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1743 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1744 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1745 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1746 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1747 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1748 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1749 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1750
1751 FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(DATA_TYPE, in_row0.s, y_cond0.s0, z_cond);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001752
1753 // Calculate common factors for intermediate tensor
1754 VEC_DATA_TYPE(DATA_TYPE, 8)
1755 tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
1756
1757 VEC_DATA_TYPE(DATA_TYPE, 8)
1758 out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
1759
1760 VEC_DATA_TYPE(DATA_TYPE, 8)
1761 comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
1762
1763 OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
1764#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1765 VEC_DATA_TYPE(DATA_TYPE, 8)
1766 in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
1767
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001768 // Row0
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001769 in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1770 in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1771 in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1772 in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1773 in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1774 in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1775 in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1776 in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001777
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001778 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001779
1780 // Row1
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001781 in_row1.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1782 in_row1.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1783 in_row1.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1784 in_row1.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1785 in_row1.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1786 in_row1.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1787 in_row1.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1788 in_row1.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001789
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001790 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row1.s, y_cond, z_cond0.s1);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001791
1792 // Row2
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001793 in_row2.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1794 in_row2.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1795 in_row2.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1796 in_row2.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1797 in_row2.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1798 in_row2.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1799 in_row2.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1800 in_row2.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001801
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001802 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row2.s, y_cond, z_cond0.s2);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001803
1804 // Row3
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001805 in_row3.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1806 in_row3.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1807 in_row3.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1808 in_row3.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1809 in_row3.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1810 in_row3.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1811 in_row3.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1812 in_row3.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001813
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001814 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row3.s, y_cond, z_cond0.s3);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001815
1816 // Row4
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001817 in_row4.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1818 in_row4.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1819 in_row4.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1820 in_row4.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1821 in_row4.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1822 in_row4.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1823 in_row4.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1824 in_row4.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001825
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001826 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row4.s, y_cond, z_cond0.s4);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001827
1828 // Row5
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001829 in_row5.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1830 in_row5.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1831 in_row5.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1832 in_row5.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1833 in_row5.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1834 in_row5.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1835 in_row5.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1836 in_row5.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001837
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001838 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row5.s, y_cond, z_cond0.s5);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001839
1840 // Row6
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001841 in_row6.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1842 in_row6.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1843 in_row6.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1844 in_row6.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1845 in_row6.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1846 in_row6.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1847 in_row6.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1848 in_row6.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001849
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001850 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row6.s, y_cond, z_cond0.s6);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001851
1852 // Row7
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001853 in_row7.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1854 in_row7.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1855 in_row7.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1856 in_row7.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1857 in_row7.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1858 in_row7.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1859 in_row7.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1860 in_row7.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001861
Gian Marco Iodicebc6c3742020-10-19 12:49:44 +01001862 FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row7.s, y_cond, z_cond0.s7);
Michele Di Giorgiof955d512019-02-27 14:26:51 +00001863
1864 VEC_DATA_TYPE(DATA_TYPE, 8)
1865 comm_fact0 = (DATA_TYPE)36.0f * in_row2 - (DATA_TYPE)13.0f * in_row4 + in_row6;
1866 VEC_DATA_TYPE(DATA_TYPE, 8)
1867 comm_fact1 = (DATA_TYPE)36.0f * in_row1 - (DATA_TYPE)13.0f * in_row3 + in_row5;
1868 VEC_DATA_TYPE(DATA_TYPE, 8)
1869 comm_fact2 = (DATA_TYPE)9.0f * in_row2 - (DATA_TYPE)10.0f * in_row4 + in_row6;
1870 VEC_DATA_TYPE(DATA_TYPE, 8)
1871 comm_fact3 = (DATA_TYPE)18.0f * in_row1 - (DATA_TYPE)20.0f * in_row3 + (DATA_TYPE)2.0f * in_row5;
1872 VEC_DATA_TYPE(DATA_TYPE, 8)
1873 comm_fact4 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
1874 VEC_DATA_TYPE(DATA_TYPE, 8)
1875 comm_fact5 = (DATA_TYPE)12.0f * in_row1 - (DATA_TYPE)15.0f * in_row3 + (DATA_TYPE)3.0f * in_row5;
1876
1877 // Calculate intermediate tensors
1878 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = -(DATA_TYPE)36.0f * in_row0 + (DATA_TYPE)49.0f * in_row2 - (DATA_TYPE)14.0f * in_row4 + in_row6;
1879 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 - comm_fact1;
1880 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 + comm_fact1;
1881 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact2 - comm_fact3;
1882 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 + comm_fact3;
1883 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact4 - comm_fact5;
1884 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact4 + comm_fact5;
1885 const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = -(DATA_TYPE)36.0f * in_row1 + (DATA_TYPE)49.0f * in_row3 - (DATA_TYPE)14.0f * in_row5 + in_row7;
1886
1887 VEC_DATA_TYPE(DATA_TYPE, 8)
1888 out0, out1, out2, out3, out4, out5, out6, out7;
1889
1890 OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
1891 OUTPUT_ROW_2x2_7x7(out1, tmp1, comm_fact0);
1892 OUTPUT_ROW_2x2_7x7(out2, tmp2, comm_fact0);
1893 OUTPUT_ROW_2x2_7x7(out3, tmp3, comm_fact0);
1894 OUTPUT_ROW_2x2_7x7(out4, tmp4, comm_fact0);
1895 OUTPUT_ROW_2x2_7x7(out5, tmp5, comm_fact0);
1896 OUTPUT_ROW_2x2_7x7(out6, tmp6, comm_fact0);
1897 OUTPUT_ROW_2x2_7x7(out7, tmp7, comm_fact0);
1898
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001899#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001900
1901 // Store values across the channels
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001902#if defined(NUM_TILES_Y)
Georgios Pinitasc55beee2018-10-23 15:23:23 +01001903 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001904#else /* NUM_TILES_Y */
1905 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
1906#endif /* NUM_TILES_Y */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001907
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001908 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
1909 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
1910 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
1911 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
1912 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
1913 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
1914 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
1915 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001916
1917#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001918 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
1919 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
1920 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
1921 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
1922 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
1923 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
1924 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
1925 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
1926 *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
1927 *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
1928 *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
1929 *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
1930 *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
1931 *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
1932 *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
1933 *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
1934 *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
1935 *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
1936 *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
1937 *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
1938 *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
1939 *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
1940 *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
1941 *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
1942 *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
1943 *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
1944 *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
1945 *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
1946 *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
1947 *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
1948 *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
1949 *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
1950 *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
1951 *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
1952 *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
1953 *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
1954 *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
1955 *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
1956 *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
1957 *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
1958 *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
1959 *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
1960 *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
1961 *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
1962 *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
1963 *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
1964 *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
1965 *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
1966 *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
1967 *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
1968 *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
1969 *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
1970 *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
1971 *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
1972 *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
1973 *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01001974#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Giorgio Arena149fdf32018-07-04 17:03:33 +01001975}
Georgios Pinitasffb57a02018-10-29 18:01:52 +00001976#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001977
1978#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1979/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
1980 *
1981 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
1982 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
1983 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
1984 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
1985 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001986 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001987 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01001988 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01001989 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1990 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1991 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1992 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1993 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1994 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1995 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1996 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
1997 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1998 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1999 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2000 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2001 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2002 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2003 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002004 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2005 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002006 */
2007__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
2008 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002009 TENSOR3D_DECLARATION(dst),
2010 uint src_stride_w,
2011 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002012{
2013 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
2014 src_stride_x,
2015 src_step_x,
2016 src_stride_y,
2017 src_step_y,
2018 src_stride_z,
2019 src_step_z,
2020 src_offset_first_element_in_bytes,
2021 dst_ptr,
2022 dst_stride_x,
2023 dst_step_x,
2024 dst_stride_y,
2025 dst_step_y,
2026 dst_stride_z,
2027 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002028 dst_offset_first_element_in_bytes,
2029 src_stride_w,
2030 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002031}
2032
2033/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2
2034 *
2035 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2036 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2037 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
2038 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2039 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002040 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002041 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002042 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002043 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2044 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2045 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2046 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2047 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2048 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2049 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2050 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2051 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2052 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2053 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2054 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2055 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2056 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2057 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002058 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2059 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002060 */
2061__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
2062 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002063 TENSOR3D_DECLARATION(dst),
2064 uint src_stride_w,
2065 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002066{
2067 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
2068 src_stride_x,
2069 src_step_x,
2070 src_stride_y,
2071 src_step_y,
2072 src_stride_z,
2073 src_step_z,
2074 src_offset_first_element_in_bytes,
2075 dst_ptr,
2076 dst_stride_x,
2077 dst_step_x,
2078 dst_stride_y,
2079 dst_step_y,
2080 dst_stride_z,
2081 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002082 dst_offset_first_element_in_bytes,
2083 src_stride_w,
2084 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002085}
2086
2087/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
2088 *
2089 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2090 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2091 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
2092 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2093 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002094 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002095 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002096 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002097 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2098 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2099 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2100 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2101 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2102 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2103 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2104 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2105 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2106 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2107 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2108 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2109 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2110 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2111 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002112 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2113 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002114 */
2115__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
2116 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002117 TENSOR3D_DECLARATION(dst),
2118 uint src_stride_w,
2119 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002120{
2121 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
2122 src_stride_x,
2123 src_step_x,
2124 src_stride_y,
2125 src_step_y,
2126 src_stride_z,
2127 src_step_z,
2128 src_offset_first_element_in_bytes,
2129 dst_ptr,
2130 dst_stride_x,
2131 dst_step_x,
2132 dst_stride_y,
2133 dst_step_y,
2134 dst_stride_z,
2135 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002136 dst_offset_first_element_in_bytes,
2137 src_stride_w,
2138 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002139}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002140
2141/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
2142 *
2143 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2144 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2145 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
2146 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
2147 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002148 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002149 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002150 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002151 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2152 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2153 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2154 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2155 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2156 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2157 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2158 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2159 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2160 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2161 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2162 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2163 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2164 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2165 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002166 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2167 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002168 */
2169__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
2170 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002171 TENSOR3D_DECLARATION(dst),
2172 uint src_stride_w,
2173 uint dst_stride_w)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002174{
2175 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
2176 src_stride_x,
2177 src_step_x,
2178 src_stride_y,
2179 src_step_y,
2180 src_stride_z,
2181 src_step_z,
2182 src_offset_first_element_in_bytes,
2183 dst_ptr,
2184 dst_stride_x,
2185 dst_step_x,
2186 dst_stride_y,
2187 dst_step_y,
2188 dst_stride_z,
2189 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002190 dst_offset_first_element_in_bytes,
2191 src_stride_w,
2192 dst_stride_w);
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002193}
2194
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002195#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002196/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
2197 *
2198 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2199 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2200 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2201 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2202 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
2203 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2204 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002205 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002206 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002207 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002208 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2209 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2210 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2211 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2212 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2213 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2214 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2215 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2216 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2217 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2218 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2219 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2220 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2221 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2222 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002223 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2224 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002225 */
2226__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
2227 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002228 TENSOR3D_DECLARATION(dst),
2229 uint src_stride_w,
2230 uint dst_stride_w)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002231{
2232 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
2233 src_stride_x,
2234 src_step_x,
2235 src_stride_y,
2236 src_step_y,
2237 src_stride_z,
2238 src_step_z,
2239 src_offset_first_element_in_bytes,
2240 dst_ptr,
2241 dst_stride_x,
2242 dst_step_x,
2243 dst_stride_y,
2244 dst_step_y,
2245 dst_stride_z,
2246 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002247 dst_offset_first_element_in_bytes,
2248 src_stride_w,
2249 dst_stride_w);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002250}
2251
2252/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
2253 *
2254 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2255 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2256 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2257 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2258 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
2259 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2260 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002261 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002262 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002263 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002264 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2265 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2266 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2267 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2268 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2269 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2270 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2271 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2272 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2273 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2274 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2275 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2276 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2277 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2278 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002279 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2280 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002281 */
2282__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
2283 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002284 TENSOR3D_DECLARATION(dst),
2285 uint src_stride_w,
2286 uint dst_stride_w)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002287{
2288 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
2289 src_stride_x,
2290 src_step_x,
2291 src_stride_y,
2292 src_step_y,
2293 src_stride_z,
2294 src_step_z,
2295 src_offset_first_element_in_bytes,
2296 dst_ptr,
2297 dst_stride_x,
2298 dst_step_x,
2299 dst_stride_y,
2300 dst_step_y,
2301 dst_stride_z,
2302 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002303 dst_offset_first_element_in_bytes,
2304 src_stride_w,
2305 dst_stride_w);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002306}
Michele Di Giorgiof955d512019-02-27 14:26:51 +00002307
2308/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
2309 *
2310 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
2311 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2312 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2313 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2314 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=7
2315 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
2316 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
2317 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
2318 *
2319 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
2320 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2321 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2322 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2323 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2324 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2325 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2326 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2327 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2328 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2329 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2330 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2331 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2332 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2333 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2334 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2335 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2336 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
2337 */
2338__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
2339 TENSOR3D_DECLARATION(src),
2340 TENSOR3D_DECLARATION(dst),
2341 uint src_stride_w,
2342 uint dst_stride_w)
2343{
2344 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
2345 src_stride_x,
2346 src_step_x,
2347 src_stride_y,
2348 src_step_y,
2349 src_stride_z,
2350 src_step_z,
2351 src_offset_first_element_in_bytes,
2352 dst_ptr,
2353 dst_stride_x,
2354 dst_step_x,
2355 dst_stride_y,
2356 dst_step_y,
2357 dst_stride_z,
2358 dst_step_z,
2359 dst_offset_first_element_in_bytes,
2360 src_stride_w,
2361 dst_stride_w);
2362}
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002363#endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002364#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
2365
2366#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
2367/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
2368 *
2369 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2370 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2371 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2372 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
2373 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002374 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002375 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002376 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002377 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2378 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2379 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2380 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2381 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2382 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2383 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2384 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2385 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2386 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2387 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2388 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2389 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2390 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2391 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002392 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2393 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002394 */
2395__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
2396 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002397 TENSOR3D_DECLARATION(dst),
2398 uint src_stride_w,
2399 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002400{
2401 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
2402 src_stride_x,
2403 src_step_x,
2404 src_stride_y,
2405 src_step_y,
2406 src_stride_z,
2407 src_step_z,
2408 src_offset_first_element_in_bytes,
2409 dst_ptr,
2410 dst_stride_x,
2411 dst_step_x,
2412 dst_stride_y,
2413 dst_step_y,
2414 dst_stride_z,
2415 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002416 dst_offset_first_element_in_bytes,
2417 src_stride_w,
2418 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002419}
2420
2421/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2
2422 *
2423 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2424 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2425 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2426 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
2427 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002428 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002429 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002430 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002431 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2432 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2433 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2434 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2435 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2436 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2437 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2438 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2439 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2440 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2441 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2442 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2443 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2444 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2445 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002446 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2447 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002448 */
2449__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
2450 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002451 TENSOR3D_DECLARATION(dst),
2452 uint src_stride_w,
2453 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002454{
2455 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
2456 src_stride_x,
2457 src_step_x,
2458 src_stride_y,
2459 src_step_y,
2460 src_stride_z,
2461 src_step_z,
2462 src_offset_first_element_in_bytes,
2463 dst_ptr,
2464 dst_stride_x,
2465 dst_step_x,
2466 dst_stride_y,
2467 dst_step_y,
2468 dst_stride_z,
2469 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002470 dst_offset_first_element_in_bytes,
2471 src_stride_w,
2472 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002473}
2474
2475/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
2476 *
2477 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2478 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2479 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2480 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2481 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002482 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002483 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002484 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002485 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2486 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2487 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2488 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2489 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2490 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2491 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2492 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2493 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2494 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2495 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2496 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2497 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2498 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2499 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002500 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2501 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002502 */
2503__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
2504 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002505 TENSOR3D_DECLARATION(dst),
2506 uint src_stride_w,
2507 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002508{
2509 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
2510 src_stride_x,
2511 src_step_x,
2512 src_stride_y,
2513 src_step_y,
2514 src_stride_z,
2515 src_step_z,
2516 src_offset_first_element_in_bytes,
2517 dst_ptr,
2518 dst_stride_x,
2519 dst_step_x,
2520 dst_stride_y,
2521 dst_step_y,
2522 dst_stride_z,
2523 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002524 dst_offset_first_element_in_bytes,
2525 src_stride_w,
2526 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002527}
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002528
2529/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
2530 *
2531 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2532 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2533 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2534 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2535 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002536 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002537 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002538 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002539 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2540 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2541 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2542 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2543 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2544 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2545 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2546 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2547 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2548 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2549 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2550 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2551 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2552 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2553 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002554 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2555 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002556 */
2557__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
2558 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002559 TENSOR3D_DECLARATION(dst),
2560 uint src_stride_w,
2561 uint dst_stride_w)
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002562{
2563 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
2564 src_stride_x,
2565 src_step_x,
2566 src_stride_y,
2567 src_step_y,
2568 src_stride_z,
2569 src_step_z,
2570 src_offset_first_element_in_bytes,
2571 dst_ptr,
2572 dst_stride_x,
2573 dst_step_x,
2574 dst_stride_y,
2575 dst_step_y,
2576 dst_stride_z,
2577 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002578 dst_offset_first_element_in_bytes,
2579 src_stride_w,
2580 dst_stride_w);
Gian Marco Iodice876be2a2018-07-03 12:22:09 +01002581}
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002582
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002583#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002584/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002585 *
2586 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002587 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2588 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002589 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002590 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002591 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002592 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002593 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002594 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002595 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002596 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2597 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2598 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2599 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2600 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2601 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2602 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2603 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2604 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2605 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2606 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2607 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2608 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2609 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2610 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002611 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2612 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002613 */
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002614__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002615 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002616 TENSOR3D_DECLARATION(dst),
2617 uint src_stride_w,
2618 uint dst_stride_w)
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002619{
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002620 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
2621 src_stride_x,
2622 src_step_x,
2623 src_stride_y,
2624 src_step_y,
2625 src_stride_z,
2626 src_step_z,
2627 src_offset_first_element_in_bytes,
2628 dst_ptr,
2629 dst_stride_x,
2630 dst_step_x,
2631 dst_stride_y,
2632 dst_step_y,
2633 dst_stride_z,
2634 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002635 dst_offset_first_element_in_bytes,
2636 src_stride_w,
2637 dst_stride_w);
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002638}
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002639
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002640/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
2641 *
2642 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
2643 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2644 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2645 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2646 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2647 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
2648 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002649 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002650 *
Vidhya Sudhan Loganathan71ecf392018-08-31 16:10:16 +01002651 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002652 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2653 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2654 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2655 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2656 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2657 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2658 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2659 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2660 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2661 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2662 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2663 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2664 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2665 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2666 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002667 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2668 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002669 */
2670__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
2671 TENSOR3D_DECLARATION(src),
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002672 TENSOR3D_DECLARATION(dst),
2673 uint src_stride_w,
2674 uint dst_stride_w)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002675{
2676 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
2677 src_stride_x,
2678 src_step_x,
2679 src_stride_y,
2680 src_step_y,
2681 src_stride_z,
2682 src_step_z,
2683 src_offset_first_element_in_bytes,
2684 dst_ptr,
2685 dst_stride_x,
2686 dst_step_x,
2687 dst_stride_y,
2688 dst_step_y,
2689 dst_stride_z,
2690 dst_step_z,
Georgios Pinitasc55beee2018-10-23 15:23:23 +01002691 dst_offset_first_element_in_bytes,
2692 src_stride_w,
2693 dst_stride_w);
Giorgio Arenaa50e5e02018-07-02 13:42:23 +01002694}
Michele Di Giorgiof955d512019-02-27 14:26:51 +00002695
2696/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
2697 *
2698 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
2699 * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
2700 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
2701 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
2702 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
2703 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=7
2704 * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
2705 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
2706 *
2707 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
2708 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
2709 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2710 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
2711 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2712 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
2713 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2714 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
2715 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
2716 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2717 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2718 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2719 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2720 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2721 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
2722 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2723 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
2724 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
2725 */
2726__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
2727 TENSOR3D_DECLARATION(src),
2728 TENSOR3D_DECLARATION(dst),
2729 uint src_stride_w,
2730 uint dst_stride_w)
2731{
2732 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
2733 src_stride_x,
2734 src_step_x,
2735 src_stride_y,
2736 src_step_y,
2737 src_stride_z,
2738 src_step_z,
2739 src_offset_first_element_in_bytes,
2740 dst_ptr,
2741 dst_stride_x,
2742 dst_step_x,
2743 dst_stride_y,
2744 dst_step_y,
2745 dst_stride_z,
2746 dst_step_z,
2747 dst_offset_first_element_in_bytes,
2748 src_stride_w,
2749 dst_stride_w);
2750}
Georgios Pinitasffb57a02018-10-29 18:01:52 +00002751#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
Gian Marco Iodiced28b7512018-07-06 12:59:28 +01002752#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
Michele Di Giorgiof955d512019-02-27 14:26:51 +00002753#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)