/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;

#include "helpers_cs.h"

#if defined(DATA_TYPE_FP16)
precision mediump float;
#endif // DATA_TYPE_FP16

#ifdef RESHAPE_TO_COLUMNS

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME", e.g. "#define DATA_TYPE_FP32"
 * @note In case biases will be added to the convolution, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr       Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_attrs     The attributes of the source tensor
 * @param[out] dst_ptr       Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs     The attributes of the destination tensor
 * @param[in]  biases_ptr    Pointer to the biases tensor. Supported data types: same as @p src_ptr
 * @param[in]  biases_attrs  The attributes of the biases tensor
 * @param[in]  width         The width of the input tensor
 * @param[in]  height        The height of the input tensor
 * @param[in]  depth         The depth of the input tensor
 * @param[in]  total_filters Total number of filters. 4th dimension of the weights matrix
 */

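// Illustrative build-time configuration for this kernel (a sketch only: the real
// defines are injected by the host when compiling the shader, and the values
// below are assumptions, not taken from the library):
//   #define RESHAPE_TO_COLUMNS
//   #define DATA_TYPE_FP32
//   #define HAS_BIAS      // only when a bias vector is appended to the reshaped matrix
//   #define LOCAL_SIZE_X 4
//   #define LOCAL_SIZE_Y 4
//   #define LOCAL_SIZE_Z 4
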
SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;
    ImageAttributes    dst_attrs;
#ifdef HAS_BIAS
    VectorAttributes   biases_attrs;
#endif /* HAS_BIAS */
    uint               width;
    uint               height;
    uint               depth;
    uint               total_filters;
};

#if defined(DATA_TYPE_FP32)

TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
#ifdef HAS_BIAS
TENSOR_DECLARATION(3, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* HAS_BIAS */

void main()
{
    Tensor3DIterator src_iter    = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    ImageIterator    dst_iter    = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
#ifdef HAS_BIAS
    VectorIterator   biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* HAS_BIAS */

    bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
                           && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.z) * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
    // Linearize convolution elements
    if(is_last_thread)
    {
        for(uint i = 0u; i < uint(total_filters); ++i)
        {
            float s0 = LOAD_CURRENT_ITEM(src_ptr, src_iter);
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, s0);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
#ifdef HAS_BIAS
            float b = LOAD_CURRENT_ITEM(biases_ptr, biases_iter);
            STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, biases_attrs.stride_x);
#endif /* HAS_BIAS */
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
        }
    }
    else
    {
        for(uint i = 0u; i < uint(total_filters); ++i)
        {
            float s0 = LOAD_CURRENT_ITEM(src_ptr, src_iter);
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, s0);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
        }
    }
}

#elif defined(DATA_TYPE_FP16)

TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
#ifdef HAS_BIAS
TENSOR_DECLARATION(3, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* HAS_BIAS */

void main()
{
    Tensor3DIterator src_iter    = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    ImageIterator    dst_iter    = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
#ifdef HAS_BIAS
    VectorIterator   biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* HAS_BIAS */

    bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
                           && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.z) * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
    // Linearize convolution elements
    if(is_last_thread)
    {
        for(uint i = 0u; i < uint(total_filters); i = i + 2u)
        {
            vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            vec2 s;
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.x = s0.x;
            }
            else
            {
                s.x = s0.y;
            }
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));

            vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.y = s1.x;
            }
            else
            {
                s.y = s1.y;
            }
            STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
#ifdef HAS_BIAS
            vec2 b = LOAD_UNPACK2_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
            STORE_PACK2_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, (2u * biases_attrs.stride_x));
#endif /* HAS_BIAS */
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
        }
    }
    else
    {
        for(uint i = 0u; i < uint(total_filters); i = i + 2u)
        {
            vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            vec2 s;
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.x = s0.x;
            }
            else
            {
                s.x = s0.y;
            }
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));

            vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.y = s1.x;
            }
            else
            {
                s.y = s1.y;
            }
            STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
        }
    }
}

#endif /* DATA_TYPE_FP32 */
#endif // RESHAPE_TO_COLUMNS

#ifdef IM2COL_GENERIC

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
 * @note PAD_LEFT/PAD_RIGHT/PAD_TOP/PAD_BOTTOM must be passed for padding info, e.g. "#define PAD_LEFT xxx"
 * @note KERNEL_WIDTH/KERNEL_HEIGHT/KERNEL_DEPTH must be passed for kernel dimension, e.g. "#define KERNEL_WIDTH xxx"
 * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx"
 * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx"
 * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx"
 * @note DILATION_X/DILATION_Y must be passed for dilation sizes, e.g. "#define DILATION_X xxx"
 * @note In case biases will be added to the convolution, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr      Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_attrs    The attributes of the source tensor
 * @param[out] dst_ptr      Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs    The attributes of the destination tensor
 * @param[in]  src_stride_w Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w Stride of the destination tensor in W dimension (in bytes).
 */

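// Illustrative build-time configuration for a 3x3 kernel over a 224x224x64 input
// with stride 1, no padding and no dilation (assumed example values; the real
// defines are generated by the host at compile time):
//   #define IM2COL_GENERIC
//   #define DATA_TYPE_FP32
//   #define KERNEL_WIDTH 3
//   #define KERNEL_HEIGHT 3
//   #define KERNEL_DEPTH 64
//   #define STRIDE_X 1
//   #define STRIDE_Y 1
//   #define PAD_LEFT 0
//   #define PAD_RIGHT 0
//   #define PAD_TOP 0
//   #define PAD_BOTTOM 0
//   #define DILATION_X 1
//   #define DILATION_Y 1
//   #define SRC_WIDTH 224
//   #define SRC_HEIGHT 224
//   #define CONVOLVED_WIDTH 222
//   #define CONVOLVED_HEIGHT 222
// With these values each destination row holds KERNEL_WIDTH * KERNEL_HEIGHT * KERNEL_DEPTH
// = 576 elements (plus a trailing 1 when HAS_BIAS is defined), one row per convolution position.
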
SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;
    ImageAttributes    dst_attrs;
    uint               src_stride_w;
    uint               dst_stride_w;
};

#ifdef DATA_TYPE_FP32

TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);

void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    ImageIterator    dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    int xc    = int(gl_GlobalInvocationID.x);                // x coordinate in the convolved tensor
    int yc    = int(gl_GlobalInvocationID.y);                // y coordinate in the convolved tensor
    int ch    = int(gl_GlobalInvocationID.z) % KERNEL_DEPTH; // input feature map
    int batch = int(gl_GlobalInvocationID.z) / KERNEL_DEPTH; // the batch

    // Calculate input indices
    int xi = xc * STRIDE_X - PAD_LEFT;
    int yi = yc * STRIDE_Y - PAD_TOP;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * int(src_attrs.stride_z)) + (batch * int(src_stride_w)));

    // Calculate output indices
    int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
    int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
    // sizeof is not available in GLES, so we use stride_x instead
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * int(dst_attrs.stride_y)) + (batch * int(dst_stride_w)) + xo * int(dst_attrs.stride_x));

    uint src_pos = 0u;

    // Linearize convolution elements
    for(int y = yi, y_e = yi + KERNEL_HEIGHT * DILATION_Y; y < y_e; y += DILATION_Y)
    {
        for(int x = xi, x_e = xi + KERNEL_WIDTH * DILATION_X; x < x_e; x += DILATION_X, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, int(dst_attrs.stride_x)))
        {
#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
#else /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, 0.0f);
            }
            else
            {
                src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
            }
#endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
        }
    }

#ifdef HAS_BIAS
    if(ch == (KERNEL_DEPTH - 1))
    {
        STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f);
    }
#endif /* HAS_BIAS */
}

#elif defined(DATA_TYPE_FP16)

TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);

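// The FP16 paths below operate on packed pairs of half-precision values: each
// uint in srcBuffer/dstBuffer holds two f16 elements. A minimal sketch of the
// packing convention assumed by the bit operations that follow (illustrative
// values only):
//   mediump vec2 pair   = vec2(1.0f, 2.0f);
//   uint         packed = packHalf2x16(pair); // pair.x -> bits 0-15, pair.y -> bits 16-31
//   uint         lo     = packed & 0xffffu;   // first (even-indexed) element
//   uint         hi     = packed >> 16u;      // second (odd-indexed) element
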
#ifdef KERNEL_1x1

void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    ImageIterator    dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    uint xc    = gl_GlobalInvocationID.x;
    uint yc    = gl_GlobalInvocationID.y;
    uint zc    = gl_GlobalInvocationID.z;
    uint ch    = zc % uint(KERNEL_DEPTH); // input feature map
    uint batch = zc / uint(KERNEL_DEPTH); // the batch

    // Calculate input indices
    uint xi = xc;
    uint yi = yc;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.step_z);

    // Calculate output indices
    uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x;
    uint xo                = ch * dst_element_count;
    uint yo                = xc + yc * uint(CONVOLVED_WIDTH);
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo);

    bool x_start_even = ((xc % 2u) == 0u);
    bool z_depth_even = ((uint(KERNEL_DEPTH) % 2u) == 0u);
    uint input_pos    = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y);
    uint tmp_left     = 0u;
    uint tmp_right    = 0u;

    if(ch % 2u != 0u)
    {
        return;
    }

    if(z_depth_even || (!z_depth_even && (int(ch) < (KERNEL_DEPTH - 1))))
    {
        tmp_left  = LOAD(src_ptr, input_pos);
        input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y + src_attrs.stride_z);
        tmp_right = LOAD(src_ptr, input_pos);
        if(x_start_even)
        {
            tmp_right = (tmp_left & 0xffffu) + (tmp_right << 16u);
        }
        else
        {
            tmp_right = (tmp_left >> 16u) + (tmp_right & 0xffff0000u);
        }
        STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);

#ifdef HAS_BIAS
        if(ch == (uint(KERNEL_DEPTH) - 2u))
        {
            mediump vec2 bias_vec = vec2(1.f, 0.f);
            uint         bias_u   = packHalf2x16(bias_vec);
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, bias_u);
        }
#endif /* HAS_BIAS */
    }
    else
    {
        tmp_left = LOAD(src_ptr, input_pos);
        if(x_start_even)
        {
            tmp_right = (tmp_left & 0xffffu);
        }
        else
        {
            tmp_right = (tmp_left >> 16u);
        }

#ifdef HAS_BIAS
        mediump vec2 bias_vec = vec2(0.f, 1.f);
        uint         bias_u   = packHalf2x16(bias_vec);
        tmp_right += (bias_u & 0xffff0000u);
#endif /* HAS_BIAS */

        STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
    }
}

#else /* KERNEL_1x1 */

void main(void)
{
    uint xc    = gl_GlobalInvocationID.x;
    uint yc    = gl_GlobalInvocationID.y;
    uint zc    = gl_GlobalInvocationID.z;
    uint ch    = zc % uint(KERNEL_DEPTH); // input feature map
    uint batch = zc / uint(KERNEL_DEPTH); // the batch

    Tensor3DIterator src_iter   = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    Tensor3DIterator src_iter_b = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    ImageIterator    dst_iter   = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    // Calculate input indices
    uint src_element_count = src_attrs.step_x / src_attrs.stride_x;
    uint xi                = (xc * uint(STRIDE_X)) / src_element_count;
    uint yi                = yc * uint(STRIDE_Y);
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.stride_z);

    // Calculate output indices
    uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x;
    uint xo                = (ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT)) * dst_element_count;
    uint yo                = xc + yc * uint(CONVOLVED_WIDTH);
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo);

    bool x_start_even = ((xc * uint(STRIDE_X)) % 2u == 0u);
    bool z_start_even = ((ch % 2u) == 0u);
    uint input_pos    = 0u;
    uint tmp          = 0u;
    uint tmp_left     = 0u;
    uint tmp_right    = 0u;

    // Linearize convolution elements
    for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
    {
        uint xstart = 0u;
        uint xend   = 0u;

        // even col, even row
        if(x_start_even)
        {
            if(((y - yi + ch) % 2u) == 0u)
            {
                for(uint x = xi, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
                {
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
                    STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos));
                }
            }
            else
            {
                // 1st pair
                if(!z_start_even && (y == yi))
                {
                    // cross 2d feature map
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + (ch - 1u) * src_attrs.stride_z);
                }
                else
                {
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y);
                }
                tmp_right = LOAD(src_ptr, input_pos);
                input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
                tmp_left  = LOAD(src_ptr, input_pos);
                tmp_right = (tmp_right & 0xffffu) + (tmp_left << 16u);
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
                TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);

                // remaining
                for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
                {
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x - 1u) * src_attrs.step_x + y * src_attrs.stride_y);
                    tmp_left  = LOAD(src_ptr, input_pos);
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
                    tmp_right = LOAD(src_ptr, input_pos);
                    tmp_right = (tmp_left >> 16u) + (tmp_right << 16u);
                    STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
                }
            }
        }
        else
        {
            if((((y - yi) % 2u) == 0u && !z_start_even) || (((y - yi) % 2u) != 0u && z_start_even))
            {
                // 1st pair
                if(y == yi)
                {
                    // cross 2d feature map
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + (ch - 1u) * src_attrs.stride_z);
                }
                else
                {
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y);
                }

                tmp_right = LOAD(src_ptr, input_pos);
                input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
                tmp_left  = LOAD(src_ptr, input_pos);
                tmp_right = (tmp_right >> 16u) + (tmp_left & 0xffff0000u);
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
                TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);

                // remaining
                for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
                {
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
                    STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos));
                }
            }
            else if((((y - yi) % 2u) == 0u && z_start_even) || (((y - yi) % 2u) != 0u && !z_start_even))
            {
                // 1st pair
                input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
                tmp_right = LOAD(src_ptr, input_pos);
                input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + 1u) * src_attrs.step_x + y * src_attrs.stride_y);
                tmp_left  = LOAD(src_ptr, input_pos);
                tmp_right = (tmp_right >> 16u) + (tmp_left << 16u);
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
                TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);

                // remaining
                for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
                {
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
                    tmp_right = LOAD(src_ptr, input_pos);
                    input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x + 1u) * src_attrs.step_x + y * src_attrs.stride_y);
                    tmp_left  = LOAD(src_ptr, input_pos);
                    tmp_right = (tmp_right >> 16u) + (tmp_left << 16u);
                    STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
                }
            }
        }
    }

    // NOTE: the last element must be handled manually instead of in the loops
    // above, to avoid a write conflict across the 2D boundary
    if(ch == uint(KERNEL_DEPTH) - 1u)
    {
        uint x    = xi + (uint(KERNEL_WIDTH) / 2u);
        uint y    = yi + uint(KERNEL_HEIGHT) - 1u;
        input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
        tmp       = LOAD(src_ptr, input_pos);
        if(!x_start_even)
        {
            tmp = (tmp >> 16u) + (tmp << 16u);
        }

#ifdef HAS_BIAS
        mediump vec2 bias_vec = vec2(1.f, 1.f);
        uint         bias_u   = packHalf2x16(bias_vec);
        if(z_start_even)
        {
            tmp = (tmp & 0xffffu) + (bias_u & 0xffff0000u);
        }
        else
        {
            tmp = (bias_u & 0xffffu);
        }
#endif /* HAS_BIAS */

        STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
    }
}

#endif /* KERNEL_1x1 */
#else /* DATA_TYPE_FP32 */
#error Data type not supported
#endif /* DATA_TYPE_FP32 */
#endif /* IM2COL_GENERIC */

#ifdef IM2COL_REDUCED

/** This kernel reshapes the tensor's lowest three dimensions into a single row for the GEMM operation
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
 * @note In case biases will be added at a later stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr   Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_attrs The attributes of the source tensor
 * @param[out] dst_ptr   Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs The attributes of the destination tensor
 * @param[in]  width     The width of the input tensor
 * @param[in]  height    The height of the input tensor
 */

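// Illustrative build-time configuration (assumed example only; the host selects
// the variant and values when compiling the shader):
//   #define IM2COL_REDUCED
//   #define DATA_TYPE_FP16
//   #define IM2COL_REDUCED_GENERIC   // generic path; alternatively IM2COL_REDUCED_4X or
//                                    // IM2COL_REDUCED_8X select the vectorized paths, which
//                                    // also require e.g. "#define IMAGE_SIZE 50176"
//                                    // (width * height of one input plane)
//   #define HAS_BIAS                 // only when a trailing 1 is appended
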
SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;
    VectorAttributes   dst_attrs;
    uint               width;
    uint               height;
};

#ifdef DATA_TYPE_FP32

TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);

void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    VectorIterator   dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    uvec3 pos            = uvec3(gl_GlobalInvocationID.xyz);
    uvec3 size           = uvec3(gl_WorkGroupSize.xyz);
    uint  image_size     = width * height;
    uint  tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x + pos.y * width + pos.z * image_size);

    STORE(dst_ptr, tmp_out_offset, LOAD_CURRENT_ITEM(src_ptr, src_iter));

#ifdef HAS_BIAS
    // If it is the last thread in the 3 dimensional workgroup
    if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
    {
        tmp_out_offset += (dst_attrs.stride_x >> uint(2));
        STORE(dst_ptr, tmp_out_offset, 1.f);
    }
#endif // HAS_BIAS
}

#elif defined(DATA_TYPE_FP16)

#if defined(IM2COL_REDUCED_8X)
TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, restrict);
#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, restrict);
#else /* IM2COL_REDUCED_8X */
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict);
#endif /* IM2COL_REDUCED_8X */

#if defined(IM2COL_REDUCED_GENERIC)

void main(void)
{
    Tensor3DIterator src_iter        = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator src_nostep_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    VectorIterator   dst_iter        = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    uvec3 pos            = uvec3(gl_GlobalInvocationID.xyz);
    uvec3 size           = uvec3(gl_WorkGroupSize.xyz);
    uint  image_size     = width * height;
    uint  element_count  = src_attrs.step_x / src_attrs.stride_x;
    uint  tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * element_count + pos.y * width + pos.z * image_size);
    uint  width_fp16     = (width + uint(1)) >> uint(1);
    uint  tmp;

    // odd width
    if(width % uint(2) != uint(0))
    {
        // even row
        if((pos.y + pos.z * height) % uint(2) == uint(0))
        {
            // skip the last element of each line to avoid write conflicts, except on the last line
            if((pos.x < (width / element_count)) || ((pos.y == gl_NumWorkGroups.y - 1u) && (pos.z == gl_NumWorkGroups.z - 1u)))
            {
                tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
                STORE(dst_ptr, tmp_out_offset, tmp);
            }
        }
        else
        {
            // special op
            uint tmp_left  = uint(0);
            uint tmp_right = uint(0);
            tmp_right      = LOAD_CURRENT_ITEM(src_ptr, src_iter); // right half
            if(pos.x == uint(0))
            {
                tmp_left  = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); // left half
                tmp_right = (tmp_left & uint(0xffff)) + (tmp_right << uint(16));
            }
            else
            {
                tmp_left  = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z)));
                tmp_right = ((tmp_left >> uint(16)) + (tmp_right << uint(16)));
            }
            STORE(dst_ptr, tmp_out_offset, tmp_right);
        }
    }
    else
    {
        tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
        STORE(dst_ptr, tmp_out_offset, tmp);
    }

#ifdef HAS_BIAS
    // If it is the last thread in the 3 dimensional workgroup
    if(pos.x == (size.x - 1u) && pos.y == (size.y - 1u) && pos.z == (size.z - 1u))
    {
        tmp_out_offset += (dst_attrs.stride_x >> dst_shift);

        // FIXME: need odd/even detection for tmp_out_offset?
        mediump vec2 bias_vec = vec2(1.0f, 1.0f);
        STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec);
    }
#endif // HAS_BIAS
}

#else /* IM2COL_REDUCED_GENERIC */

void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    VectorIterator   dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
#if defined(IM2COL_REDUCED_8X)
    uint  tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE));
    uvec4 tmp            = LOAD_CURRENT_ITEM(src_ptr, src_iter);
    STORE(dst_ptr, tmp_out_offset, tmp);
#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
    uint  tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE));
    uvec2 tmp            = LOAD_CURRENT_ITEM(src_ptr, src_iter);
    STORE(dst_ptr, tmp_out_offset, tmp);
#else /* IM2COL_REDUCED_8X */
    uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE));
    uint tmp            = LOAD_CURRENT_ITEM(src_ptr, src_iter);
    STORE(dst_ptr, tmp_out_offset, tmp);
#endif /* IM2COL_REDUCED_8X */
}

#endif /* IM2COL_REDUCED_GENERIC */
#else /* DATA_TYPE_FP32 */
#error Data type not supported
#endif /* DATA_TYPE_FP32 */
#endif /* IM2COL_REDUCED */

#ifdef COL2IM
#ifdef WIDTH_OUTPUT

/** This kernel performs a reshaping of the output of the convolution layer.
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
 *
 * @param[in]  src_ptr     Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_attrs   The attributes of the source tensor
 * @param[out] dst_ptr     Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs   The attributes of the destination tensor
 * @param[in]  dst_depth   The length of the destination tensor in Z dimension
 * @param[in]  dst_strideZ The actual stride of the destination tensor in Z dimension
 */

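// Illustrative build-time configuration (assumed example values; the real defines
// come from the host at compile time):
//   #define COL2IM
//   #define DATA_TYPE_FP32
//   #define WIDTH_OUTPUT 222   // width of the convolved output
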
SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;
    Tensor3DAttributes dst_attrs;
    uint               dst_depth;
    uint               dst_strideZ;
};

#ifdef DATA_TYPE_FP32

TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);

void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ);

    STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
}

#elif defined(DATA_TYPE_FP16)

TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict);

void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);

    if((pos.z % dst_depth) % 2u == 0u)
    {
        uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ;
        uint tmp1_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
        uint tmp2_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
        vec2 tmp1                   = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
        vec2 tmp2                   = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
        vec2 result                 = vec2(tmp1.x, tmp2.x);
        STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
    }
    else
    {
        uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ - 2u;
        uint tmp1_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
        uint tmp2_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
        vec2 tmp1                   = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
        vec2 tmp2                   = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
        vec2 result                 = vec2(tmp1.y, tmp2.y);
        STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
    }
}

#else /* DATA_TYPE_FP32 */
#error Data type not supported
#endif /* DATA_TYPE_FP32 */
#endif /* WIDTH_OUTPUT */
#endif /* COL2IM */