blob: 274ec200469ec9cc87feb9b4f6139df6a9f126e6 [file] [log] [blame]
Gian Marco76faef82018-01-29 12:15:32 +00001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Gian Marco76faef82018-01-29 12:15:32 +000026#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
Gian Marco76faef82018-01-29 12:15:32 +000027
28#if ELEMENT_SIZE == 1
29#define COND_DATA_TYPE char
30#elif ELEMENT_SIZE == 2
31#define COND_DATA_TYPE short
32#elif ELEMENT_SIZE == 4
33#define COND_DATA_TYPE int
34#else // ELEMENT_SIZE
35#error "Element size not support"
36#endif // ELEMENT_SIZE
37
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +010038#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
39/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
Gian Marco76faef82018-01-29 12:15:32 +000040 *
Gian Marco76faef82018-01-29 12:15:32 +000041 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
42 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +010043 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
Gian Marco76faef82018-01-29 12:15:32 +000044 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
45 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
46 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010047 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
Gian Marco76faef82018-01-29 12:15:32 +000048 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
49 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
50 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
51 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
52 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
53 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
54 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
55 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
56 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
57 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
58 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
59 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
60 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
61 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
62 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
63 */
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +010064__kernel void im2col1x1_stridex1_nchw(
Gian Marco76faef82018-01-29 12:15:32 +000065 TENSOR3D_DECLARATION(src),
66 IMAGE_DECLARATION(dst),
67 uint src_stride_w,
68 uint dst_stride_w)
69{
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +010070 const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor
71 const uint yc = get_global_id(1); // y coordinate in the convolved tensor
72 const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map
73 const uint batch = get_global_id(2) / SRC_DEPTH; // batch size
Gian Marco76faef82018-01-29 12:15:32 +000074
75 // Clamp xc
76 // The strategy clamps at "xc" as it will be a valid value for sure
77 uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
78
79 // Check which values are valid
80 const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
81
82 xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
83
84 // Calculate input indices
85 const uint xi = xc;
86 const uint yi = yc * STRIDE_Y;
87
88 // Calculate output indices
89 const uint xo = ch;
90 const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
91
92 // Get input and output address
93 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
94
95 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
96
97 VEC_DATA_TYPE(DATA_TYPE, 4)
98 data = vload4(0, (__global DATA_TYPE *)input_ptr);
99
100 // If out-of-bound, overwrite with the first element
101 data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
102
103 *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
104 *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
105 *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
106 *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
107
108#ifdef HAS_BIAS
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100109 if(ch == (SRC_DEPTH - 1))
Gian Marco76faef82018-01-29 12:15:32 +0000110 {
111 *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
112 *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
113 *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
114 *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
115 }
116#endif // HAS_BIAS
117}
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100118#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
Gian Marco76faef82018-01-29 12:15:32 +0000119
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100120#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
121#if defined(DILATION_X) && defined(DILATION_Y)
122/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW
Pablo Tello4a626a72018-04-04 10:01:14 +0100123 *
124 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
125 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
126 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100127 * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
Pablo Tello4a626a72018-04-04 10:01:14 +0100128 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
129 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
130 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
Georgios Pinitas19ea4192018-06-19 13:09:53 +0100131 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Pablo Tello4a626a72018-04-04 10:01:14 +0100132 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
133 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100134 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
Pablo Tello4a626a72018-04-04 10:01:14 +0100135 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
136 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
137 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
138 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
139 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
140 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
141 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
142 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
143 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
144 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
145 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
146 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
147 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
148 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
149 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
150 */
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100151__kernel void im2col_generic_nchw(
Pablo Tello4a626a72018-04-04 10:01:14 +0100152 TENSOR3D_DECLARATION(src),
153 IMAGE_DECLARATION(dst),
154 uint src_stride_w,
155 uint dst_stride_w)
156{
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100157 const int xc = get_global_id(0); // x coordinate in the convolved tensor
158 const int yc = get_global_id(1); // y coordinate in the convolved tensor
159 const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
160 const int batch = get_global_id(2) / SRC_DEPTH; // batch size
Pablo Tello4a626a72018-04-04 10:01:14 +0100161
162 // Calculate input indices
163 const int xi = xc * STRIDE_X - PAD_LEFT;
164 const int yi = yc * STRIDE_Y - PAD_TOP;
165
166 // Calculate output indices
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100167 const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
Pablo Tello4a626a72018-04-04 10:01:14 +0100168 const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
169
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100170 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
171 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
Pablo Tello4a626a72018-04-04 10:01:14 +0100172
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100173 // Linearize convolution elements
Pablo Tello4a626a72018-04-04 10:01:14 +0100174 for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
175 {
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100176 int y = yi + yk * DILATION_Y;
177 for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
Pablo Tello4a626a72018-04-04 10:01:14 +0100178 {
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100179 int x = xi + xk * DILATION_X;
180#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
181 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
182#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
183 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
Pablo Tello4a626a72018-04-04 10:01:14 +0100184 {
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100185 *output_ptr = PAD_VALUE;
Pablo Tello4a626a72018-04-04 10:01:14 +0100186 }
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100187 else
Pablo Tello4a626a72018-04-04 10:01:14 +0100188 {
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100189 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
Pablo Tello4a626a72018-04-04 10:01:14 +0100190 }
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100191#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
Pablo Tello4a626a72018-04-04 10:01:14 +0100192 }
193 }
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100194
Pablo Tello4a626a72018-04-04 10:01:14 +0100195#ifdef HAS_BIAS
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100196 if(ch == (SRC_DEPTH - 1))
Pablo Tello4a626a72018-04-04 10:01:14 +0100197 {
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100198 *output_ptr = 1.0f;
Pablo Tello4a626a72018-04-04 10:01:14 +0100199 }
200#endif // HAS_BIAS
201}
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100202#endif // defined(DILATION_X) && defined(DILATION_Y)
Pablo Tello4a626a72018-04-04 10:01:14 +0100203
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100204/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
Pablo Tello4a626a72018-04-04 10:01:14 +0100205 *
206 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
207 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
208 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100209 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
Pablo Tello4a626a72018-04-04 10:01:14 +0100210 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
211 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
212 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
213 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
214 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100215 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
Pablo Tello4a626a72018-04-04 10:01:14 +0100216 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
217 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
218 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
219 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
220 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
221 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
222 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
223 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
224 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
225 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
226 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
227 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
228 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
229 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
230 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
231 */
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100232__kernel void im2col3x3_nchw(
Pablo Tello4a626a72018-04-04 10:01:14 +0100233 TENSOR3D_DECLARATION(src),
234 IMAGE_DECLARATION(dst),
235 uint src_stride_w,
236 uint dst_stride_w)
237{
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100238 const int xc = get_global_id(0); // x coordinate in the convolved tensor
239 const int yc = get_global_id(1); // y coordinate in the convolved tensor
240 const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
241 const int batch = get_global_id(2) / SRC_DEPTH; // batch size
Gian Marco76faef82018-01-29 12:15:32 +0000242
243 // Calculate input indices
244 const int xi = xc * STRIDE_X - PAD_LEFT;
245 const int yi = yc * STRIDE_Y - PAD_TOP;
246
247 // Calculate output indices
248 const int xo = ch * 9; // 3x3
249 const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
250
251 // Get input and output address
252 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
253
254 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
255
256 VEC_DATA_TYPE(DATA_TYPE, 3)
257 row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
258 VEC_DATA_TYPE(DATA_TYPE, 3)
259 row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
260 VEC_DATA_TYPE(DATA_TYPE, 3)
261 row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
262
263#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
264 // Put 0 if the value is out-of-bound
265 int3 x = (int3)xi + (int3)(0, 1, 2);
266 int3 y = (int3)yi + (int3)(0, 1, 2);
267
268 VEC_DATA_TYPE(COND_DATA_TYPE, 3)
269 cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
270 VEC_DATA_TYPE(COND_DATA_TYPE, 3)
271 cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
272 VEC_DATA_TYPE(COND_DATA_TYPE, 3)
273 cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
274
275 row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
276 row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);
277 row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);
278#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
279
280 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
281 *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
282
283#ifdef HAS_BIAS
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100284 if(ch == (SRC_DEPTH - 1))
Gian Marco76faef82018-01-29 12:15:32 +0000285 {
286 *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
287 }
288#endif // HAS_BIAS
289}
290
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100291/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
Gian Marco76faef82018-01-29 12:15:32 +0000292 *
293 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
294 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
295 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100296 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
Gian Marco76faef82018-01-29 12:15:32 +0000297 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
298 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
299 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
300 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
301 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100302 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
Gian Marco76faef82018-01-29 12:15:32 +0000303 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
304 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
305 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
306 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
307 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
308 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
309 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
310 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
311 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
312 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
313 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
314 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
315 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
316 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
317 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
318 */
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100319__kernel void im2col5x5_nchw(
Gian Marco76faef82018-01-29 12:15:32 +0000320 TENSOR3D_DECLARATION(src),
321 IMAGE_DECLARATION(dst),
322 uint src_stride_w,
323 uint dst_stride_w)
324{
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100325 const int xc = get_global_id(0); // x coordinate in the convolved tensor
326 const int yc = get_global_id(1); // y coordinate in the convolved tensor
327 const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
328 const int batch = get_global_id(2) / SRC_DEPTH; // batch size
Gian Marco76faef82018-01-29 12:15:32 +0000329
330 // Calculate input indices
331 const int xi = xc * STRIDE_X - PAD_LEFT;
332 const int yi = yc * STRIDE_Y - PAD_TOP;
333
334 // Calculate output indices
335 const int xo = ch * 25; // 5x5
336 const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
337
338#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
339 // Put 0 if the value is out-of-bound
340 int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
341 int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
342 int x1 = xi + 4;
343 int y1 = yi + 4;
344
345 // Check if we could have out-of-bounds elements in the x direction
346 VEC_DATA_TYPE(COND_DATA_TYPE, 4)
347 x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
348 VEC_DATA_TYPE(COND_DATA_TYPE, 4)
349 y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
350 COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
351 COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
352#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
353
354 // Get input and output address
355 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
356
357 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
358
359 {
360 VEC_DATA_TYPE(DATA_TYPE, 4)
361 row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
362 DATA_TYPE
363 row01 = *((__global DATA_TYPE *)input_ptr + 4);
364
365 input_ptr += src_stride_y;
366
367 VEC_DATA_TYPE(DATA_TYPE, 4)
368 row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
369 DATA_TYPE
370 row11 = *((__global DATA_TYPE *)input_ptr + 4);
371
372#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
373 VEC_DATA_TYPE(COND_DATA_TYPE, 4)
374 cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
375 VEC_DATA_TYPE(COND_DATA_TYPE, 4)
376 cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
377 COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
378 COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);
379
380 // Replace with 0 if the value is not valid
381 row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
382 row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
383 row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
384 row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
385#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
386
387 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
388 row10.s012),
389 0, (__global DATA_TYPE *)output_ptr);
390 vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
391
392 input_ptr += src_stride_y;
393 output_ptr += 10 * dst_stride_x;
394 }
395
396 {
397 VEC_DATA_TYPE(DATA_TYPE, 4)
398 row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
399 DATA_TYPE
400 row01 = *((__global DATA_TYPE *)input_ptr + 4);
401
402 input_ptr += src_stride_y;
403
404 VEC_DATA_TYPE(DATA_TYPE, 4)
405 row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
406 DATA_TYPE
407 row11 = *((__global DATA_TYPE *)input_ptr + 4);
408
409#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
410 VEC_DATA_TYPE(COND_DATA_TYPE, 4)
411 cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
412 VEC_DATA_TYPE(COND_DATA_TYPE, 4)
413 cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
414 COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
415 COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);
416
417 // Replace with 0 if the value is not valid
418 row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
419 row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
420 row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
421 row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
422#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
423
424 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
425 row10.s012),
426 0, (__global DATA_TYPE *)output_ptr);
427 vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
428
429 input_ptr += src_stride_y;
430 output_ptr += 10 * dst_stride_x;
431 }
432
433 {
434 VEC_DATA_TYPE(DATA_TYPE, 4)
435 row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
436 DATA_TYPE
437 row01 = *((__global DATA_TYPE *)input_ptr + 4);
438
439 input_ptr += src_stride_y;
440
441#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
442 VEC_DATA_TYPE(COND_DATA_TYPE, 4)
443 cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
444 COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);
445
446 // Replace with 0 if the value is not valid
447 row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
448 row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
449#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
450
451 vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
452 *((__global DATA_TYPE *)output_ptr + 4) = row01;
453
454 output_ptr += 5 * dst_stride_x;
455 }
456
457#ifdef HAS_BIAS
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100458 if(ch == (SRC_DEPTH - 1))
Gian Marco76faef82018-01-29 12:15:32 +0000459 {
460 *((__global DATA_TYPE *)output_ptr) = 1.0f;
461 }
462#endif // HAS_BIAS
463}
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100464#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
Gian Marco76faef82018-01-29 12:15:32 +0000465
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100466#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
467/** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW
Gian Marco76faef82018-01-29 12:15:32 +0000468 *
469 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
470 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100471 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
Gian Marco76faef82018-01-29 12:15:32 +0000472 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
473 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
474 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100475 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
Gian Marco76faef82018-01-29 12:15:32 +0000476 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
477 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
478 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
479 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
480 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
481 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
482 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
483 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
484 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
485 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
486 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
487 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
488 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
489 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
490 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
491 */
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100492__kernel void im2col11x11_padx0_pady0_nchw(
Gian Marco76faef82018-01-29 12:15:32 +0000493 TENSOR3D_DECLARATION(src),
494 IMAGE_DECLARATION(dst),
495 uint src_stride_w,
496 uint dst_stride_w)
497{
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100498 const int xc = get_global_id(0); // x coordinate in the convolved tensor
499 const int yc = get_global_id(1); // y coordinate in the convolved tensor
500 const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
501 const int batch = get_global_id(2) / SRC_DEPTH; // batch size
Gian Marco76faef82018-01-29 12:15:32 +0000502
503 // Calculate input indices
504 const int xi = xc * STRIDE_X;
505 const int yi = yc * STRIDE_Y;
506
507 // Calculate output indices
508 const int xo = ch * 121; // 11x11
509 const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
510
511 // Get input and output address
512 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
513
514 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
515 {
516 VEC_DATA_TYPE(DATA_TYPE, 8)
517 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
518 VEC_DATA_TYPE(DATA_TYPE, 3)
519 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
520
521 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
522 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
523
524 input_ptr += src_stride_y;
525 output_ptr += 11 * src_stride_x;
526 }
527
528 {
529 VEC_DATA_TYPE(DATA_TYPE, 8)
530 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
531 VEC_DATA_TYPE(DATA_TYPE, 3)
532 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
533
534 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
535 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
536
537 input_ptr += src_stride_y;
538 output_ptr += 11 * src_stride_x;
539 }
540
541 {
542 VEC_DATA_TYPE(DATA_TYPE, 8)
543 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
544 VEC_DATA_TYPE(DATA_TYPE, 3)
545 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
546
547 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
548 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
549
550 input_ptr += src_stride_y;
551 output_ptr += 11 * src_stride_x;
552 }
553
554 {
555 VEC_DATA_TYPE(DATA_TYPE, 8)
556 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
557 VEC_DATA_TYPE(DATA_TYPE, 3)
558 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
559
560 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
561 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
562
563 input_ptr += src_stride_y;
564 output_ptr += 11 * src_stride_x;
565 }
566
567 {
568 VEC_DATA_TYPE(DATA_TYPE, 8)
569 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
570 VEC_DATA_TYPE(DATA_TYPE, 3)
571 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
572
573 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
574 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
575
576 input_ptr += src_stride_y;
577 output_ptr += 11 * src_stride_x;
578 }
579
580 {
581 VEC_DATA_TYPE(DATA_TYPE, 8)
582 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
583 VEC_DATA_TYPE(DATA_TYPE, 3)
584 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
585
586 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
587 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
588
589 input_ptr += src_stride_y;
590 output_ptr += 11 * src_stride_x;
591 }
592
593 {
594 VEC_DATA_TYPE(DATA_TYPE, 8)
595 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
596 VEC_DATA_TYPE(DATA_TYPE, 3)
597 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
598
599 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
600 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
601
602 input_ptr += src_stride_y;
603 output_ptr += 11 * src_stride_x;
604 }
605
606 {
607 VEC_DATA_TYPE(DATA_TYPE, 8)
608 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
609 VEC_DATA_TYPE(DATA_TYPE, 3)
610 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
611
612 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
613 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
614
615 input_ptr += src_stride_y;
616 output_ptr += 11 * src_stride_x;
617 }
618
619 {
620 VEC_DATA_TYPE(DATA_TYPE, 8)
621 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
622 VEC_DATA_TYPE(DATA_TYPE, 3)
623 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
624
625 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
626 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
627
628 input_ptr += src_stride_y;
629 output_ptr += 11 * src_stride_x;
630 }
631
632 {
633 VEC_DATA_TYPE(DATA_TYPE, 8)
634 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
635 VEC_DATA_TYPE(DATA_TYPE, 3)
636 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
637
638 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
639 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
640
641 input_ptr += src_stride_y;
642 output_ptr += 11 * src_stride_x;
643 }
644
645 {
646 VEC_DATA_TYPE(DATA_TYPE, 8)
647 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
648 VEC_DATA_TYPE(DATA_TYPE, 3)
649 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
650
651 vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
652 vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
653
654 output_ptr += 11 * src_stride_x;
655 }
656
657#ifdef HAS_BIAS
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100658 if(ch == (SRC_DEPTH - 1))
Gian Marco76faef82018-01-29 12:15:32 +0000659 {
660 *((__global DATA_TYPE *)output_ptr) = 1.0f;
661 }
662#endif // HAS_BIAS
663}
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100664#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
Gian Marco76faef82018-01-29 12:15:32 +0000665
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100666#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
667/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW
Gian Marco76faef82018-01-29 12:15:32 +0000668 *
669 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
670 * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
671 * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100672 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
Gian Marco76faef82018-01-29 12:15:32 +0000673 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
674 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100675 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Gian Marco76faef82018-01-29 12:15:32 +0000676 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
677 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
678 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
679 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
680 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
681 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
682 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
683 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
684 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
685 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
686 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
687 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
688 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
689 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
690 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
691 */
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100692__kernel void im2col_generic_padx0_pady0_nchw(
Gian Marco76faef82018-01-29 12:15:32 +0000693 TENSOR3D_DECLARATION(src),
694 IMAGE_DECLARATION(dst),
695 uint src_stride_w,
696 uint dst_stride_w)
697{
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100698 const int xc = get_global_id(0); // x coordinate in the convolved tensor
699 const int yc = get_global_id(1); // y coordinate in the convolved tensor
700 const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
701 const int batch = get_global_id(2) / SRC_DEPTH; // batch size
Gian Marco76faef82018-01-29 12:15:32 +0000702
703 // Calculate input indices
704 const int xi = xc * STRIDE_X;
705 const int yi = yc * STRIDE_Y;
706 // Calculate output indices
707 const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
708 const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
709 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
710 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
711 // Linearize convolution elements
712 for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
713 {
714 int last_x = 0;
715 for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
716 {
717 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
718 row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
719 VSTORE(VECTOR_SIZE)
720 (row, 0, output_ptr);
721 last_x = x;
722 }
723 // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
724 // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit.
725#if WIDTH_MOD_VECTOR_SIZE == 1
726 *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
727#elif WIDTH_MOD_VECTOR_SIZE > 1
728 VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
729 row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
730 VSTORE(WIDTH_MOD_VECTOR_SIZE)
731 (row, 0, output_ptr);
732#endif /* WIDTH_MOD_VECTOR_SIZE */
733 output_ptr += WIDTH_MOD_VECTOR_SIZE;
734 } /* End of loop over KERNEL_HEIGHT */
735
736#ifdef HAS_BIAS
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100737 if(ch == (SRC_DEPTH - 1))
Gian Marco76faef82018-01-29 12:15:32 +0000738 {
Gian Marco76faef82018-01-29 12:15:32 +0000739 *output_ptr = 1.0f;
Gian Marco76faef82018-01-29 12:15:32 +0000740 }
741#endif // HAS_BIAS
742}
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100743#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
Gian Marco76faef82018-01-29 12:15:32 +0000744
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100745#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
746
747#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
748
749/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
750 *
751 * @note This kernel computes VECTOR_SIZE elements
752 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
753 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
754 * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
755 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
756 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
757 *
758 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
759 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
760 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
761 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
762 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
763 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
764 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
765 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
766 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
767 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
768 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
769 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
770 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
771 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
772 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
773 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
774 */
775__kernel void im2col3x3_nhwc(
776 TENSOR3D_DECLARATION(src),
777 IMAGE_DECLARATION(dst),
778 uint src_stride_w,
779 uint dst_stride_w)
780{
781 const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
782 const int yo = get_global_id(1);
783 const int batch = get_global_id(2); // batch size
784
785 // Calculate input indices
786 const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
787 const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
788
789 // Get input and output address
790 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
791 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
792
793 int yi_coord = 0;
794 int3 offset = 0;
795
796 // Clamp xi
797 int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
798#if PAD_TOP != 0 || PAD_BOTTOM != 0
799#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
800 xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
801#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
802 xi_offset *= (int3)src_stride_y;
803
804 // Out-of-bound condition for X
805 int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
806
807 // yi == 0
808 // Clamp yi
809 // yi_coord is casted to unsigned int in order to use just a min() operation
810 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
811 yi_coord = yi - (int)PAD_TOP;
812
813 // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
814#if PAD_TOP != 0 || PAD_BOTTOM != 0
815 yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
816#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
817
818 // Compute offset
819 offset = xi_offset + (yi_coord * (int)src_stride_z);
820
821 // Load input values
822 VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
823 VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
824 VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
825
826#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
827 // Replace invalid values with PAD_VALUE
828 int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
829 values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
830 values1 = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
831 values2 = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
832#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
833
834 // yi == 1
835 // Clamp yi_coord (it can be negative if PAD_TOP > 1)
836 yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
837
838 // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
839#if PAD_TOP != 0 || PAD_BOTTOM != 0
840 yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
841#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
842
843 // Compute offset
844 offset = xi_offset + (yi_coord * (int)src_stride_z);
845
846 // Load input values
847 VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
848 VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
849 VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
850
851#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
852 // Replace invalid values with zeros
853 y_cond = (int)((uint)(yi - (int)PAD_TOP + 1) >= (uint)(SRC_HEIGHT));
854 values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
855 values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
856 values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
857#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
858
859 // yi == 2
860 // Clamp yi_coord
861 yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
862
863 // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
864#if PAD_TOP != 0 || PAD_BOTTOM != 0
865 yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
866#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
867
868 // Compute offset
869 offset = xi_offset + (yi_coord * (int)src_stride_z);
870
871 // Load input values
872 VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
873 VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
874 VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
875
876#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
877 // Replace invalid values with PAD_VALUE
878 y_cond = (int)((uint)(yi - (int)PAD_TOP + 2) >= (uint)(SRC_HEIGHT));
879 values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
880 values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
881 values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
882#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
883
884 // Store
885 VSTORE(VECTOR_SIZE)
886 (values0, 0, (__global DATA_TYPE *)(output_ptr) + 0 * SRC_DEPTH);
887 VSTORE(VECTOR_SIZE)
888 (values1, 0, (__global DATA_TYPE *)(output_ptr) + 1 * SRC_DEPTH);
889 VSTORE(VECTOR_SIZE)
890 (values2, 0, (__global DATA_TYPE *)(output_ptr) + 2 * SRC_DEPTH);
891 VSTORE(VECTOR_SIZE)
892 (values3, 0, (__global DATA_TYPE *)(output_ptr) + 3 * SRC_DEPTH);
893 VSTORE(VECTOR_SIZE)
894 (values4, 0, (__global DATA_TYPE *)(output_ptr) + 4 * SRC_DEPTH);
895 VSTORE(VECTOR_SIZE)
896 (values5, 0, (__global DATA_TYPE *)(output_ptr) + 5 * SRC_DEPTH);
897 VSTORE(VECTOR_SIZE)
898 (values6, 0, (__global DATA_TYPE *)(output_ptr) + 6 * SRC_DEPTH);
899 VSTORE(VECTOR_SIZE)
900 (values7, 0, (__global DATA_TYPE *)(output_ptr) + 7 * SRC_DEPTH);
901 VSTORE(VECTOR_SIZE)
902 (values8, 0, (__global DATA_TYPE *)(output_ptr) + 8 * SRC_DEPTH);
903
904#ifdef HAS_BIAS
905 if((ch + VECTOR_SIZE) >= SRC_DEPTH)
906 {
907 *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
908 }
909#endif // HAS_BIAS
910}
911
912/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
Gian Marco76faef82018-01-29 12:15:32 +0000913 *
914 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
915 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
916 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100917 * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
Gian Marco76faef82018-01-29 12:15:32 +0000918 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
919 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
920 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
Alex Gilday7da29b62018-03-23 14:16:00 +0000921 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Gian Marco76faef82018-01-29 12:15:32 +0000922 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
923 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100924 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
Gian Marco76faef82018-01-29 12:15:32 +0000925 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
926 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
927 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
928 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
929 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
930 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
931 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
932 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
933 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
934 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
935 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
936 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
937 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
938 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
939 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
940 */
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100941__kernel void im2col_generic_nhwc(
Gian Marco76faef82018-01-29 12:15:32 +0000942 TENSOR3D_DECLARATION(src),
943 IMAGE_DECLARATION(dst),
944 uint src_stride_w,
945 uint dst_stride_w)
946{
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100947 const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
948 const int yo = get_global_id(1);
949 const int batch = get_global_id(2); // batch size
Gian Marco76faef82018-01-29 12:15:32 +0000950
951 // Calculate input indices
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100952 const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
953 const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
Gian Marco76faef82018-01-29 12:15:32 +0000954
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100955 // Get input and output address
956 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
957 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
Gian Marco76faef82018-01-29 12:15:32 +0000958
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100959 int i = 0;
Alex Gilday7da29b62018-03-23 14:16:00 +0000960 for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
Gian Marco76faef82018-01-29 12:15:32 +0000961 {
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100962 // Clamp yi_coord
963 int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
964 yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
965
966 // Out-of-bound condition for Y
967 int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
968
969 for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
Gian Marco76faef82018-01-29 12:15:32 +0000970 {
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100971 // Clamp xi_coord
972 int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
973 xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
974
975 // Out-of-bound condition for X
976 int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
977
978 int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
979
980 VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
981
982 // Replace with PAD_VALUE if the value is out-of-bound
983 values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))x_border_condition || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(y_border_condition));
984
985 // Store
986 VSTORE(VECTOR_SIZE)
987 (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
988
989 i++;
Gian Marco76faef82018-01-29 12:15:32 +0000990 }
991 }
992
993#ifdef HAS_BIAS
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100994 if((ch + VECTOR_SIZE) >= SRC_DEPTH)
Gian Marco76faef82018-01-29 12:15:32 +0000995 {
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100996 *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
Gian Marco76faef82018-01-29 12:15:32 +0000997 }
998#endif // HAS_BIAS
999}
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +01001000#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
Pablo Tello4a626a72018-04-04 10:01:14 +01001001#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)