blob: a36bd438ff38e6b739c3c9861c364f8159028db2 [file] [log] [blame]
Anthony Barbier7068f992017-10-26 15:23:08 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
/* Work-group size. LOCAL_SIZE_X/Y/Z are not defined in this file and must be
 * provided before this point (presumably as build-time defines). */
layout(local_size_x = LOCAL_SIZE_X,
       local_size_y = LOCAL_SIZE_Y,
       local_size_z = LOCAL_SIZE_Z) in;
26
27#include "helpers.h"
28
ASIAPAC\steli0123ac91b2017-11-07 16:14:44 +080029#ifdef DATA_TYPE_FP32
30
31precision highp float;
32
33/** This kernel performs a direct convolution to convolve the low three dimensions
34 *
35 * @note This OpenGL ES shader works with stride_x = 1 and 2
36 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
 * @note If biases are used then "#define BIAS" has to be passed at compile time
38 *
39 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
40 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
41 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
42 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
43 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
44 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
45 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
46 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
47 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
48 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
49 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
50 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
52 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
53 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
54 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
56 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
57 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
58 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
59 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
60 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
61 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
62 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
63 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
64 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
65 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
66 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
67 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
68 * @param[in] weights_depth The third dimensions of the weights tensors
69 */
70
/* Kernel arguments of the FP32 variant, laid out with std140 rules.
 * The member order is part of the buffer layout and is kept exactly as before. */
layout(std140) uniform shader_params
{
    TENSOR3D_PARAM_DECLARATION(src);
    TENSOR3D_PARAM_DECLARATION(dst);
    TENSOR3D_PARAM_DECLARATION(weights);
#if defined(BIAS)
    VECTOR_PARAM_DECLARATION(biases);
#endif /* defined(BIAS) */
    uint weights_stride_w; /* multiplied by the Z invocation index in main() to reach the weights of that output plane */
    uint weights_depth;    /* iteration count of the depth loop in main() */
};
82
/* Storage buffers of the FP32 variant; every buffer element is one float.
 * The biases buffer only exists when BIAS is defined. */
BUFFER_DECLARATION(src, 1, float, readonly);
BUFFER_DECLARATION(dst, 2, float, writeonly);
BUFFER_DECLARATION(weights, 3, float, readonly);
#if defined(BIAS)
BUFFER_DECLARATION(biases, 4, float, readonly);
#endif /* defined(BIAS) */
89
/* Loads five consecutive elements of buffer `name` into r[0] .. r[4], starting
 * at element index `offset` (despite the macro name, five values are read).
 *
 * `offset` is parenthesized at every use so that an argument containing a
 * low-precedence operator (for example `a >> 2`) is not re-associated with the
 * `+ uint(k)` that follows it.
 *
 * Notes:
 *  - `offset` is evaluated five times; avoid arguments with side effects.
 *  - The expansion is a sequence of statements, so use it only as a complete
 *    statement inside a braced block.
 */
#define LOAD20(r, name, offset)             \
    r[0] = LOAD4(name, (offset));           \
    r[1] = LOAD4(name, (offset) + uint(1)); \
    r[2] = LOAD4(name, (offset) + uint(2)); \
    r[3] = LOAD4(name, (offset) + uint(3)); \
    r[4] = LOAD4(name, (offset) + uint(4))
96
97/** This kernel performs a direct convolution to convolve the low three dimensions.
98 *
99 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
 * @note If biases are used then "#define BIAS" has to be passed at compile time
101 *
102 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
103 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
104 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
105 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
106 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
107 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
108 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
109 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
110 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
111 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
112 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
113 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
115 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
116 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
117 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Joel Liangf1f3ebd2017-11-10 09:59:19 +0800118 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Anthony Barbier7068f992017-10-26 15:23:08 +0100119 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
120 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
121 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
122 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
123 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
124 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
125 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
126 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
127 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
128 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
129 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
130 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
131 * @param[in] weights_depth The third dimensions of the weights tensors
132 */
/* FP32 entry point: every invocation produces one output element.
 * For each of the weights_depth planes it accumulates a 5x5 window
 * (five rows of five values) of source * weight products, optionally adds the
 * bias of the current Z index, and stores the single float result. */
void main()
{
    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);

#if defined(BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* defined(BIAS) */

    uint  out_plane = gl_GlobalInvocationID.z;
    float acc       = CONVERT(0, float);
    float in_row[5];
    float k_row[5];

    /* Select the weights of this output plane. The stride is documented in
     * bytes; ">> 2" turns it into an index into the float buffer. */
    weights.current_offset += (out_plane * weights_stride_w) >> 2;

    for(int d = 0; d < int(weights_depth); ++d)
    {
        /* One 1x5 dot product per kernel row, rows visited top to bottom. */
        for(int row = 0; row < 5; ++row)
        {
            LOAD20(in_row, src, offset(src, 0, row));
            LOAD20(k_row, weights, tensor3D_offset(weights, 0, row, 0));
            acc += in_row[0] * k_row[0] + in_row[1] * k_row[1] + in_row[2] * k_row[2] + in_row[3] * k_row[3] + in_row[4] * k_row[4];
        }

        /* Advance both tensors to the next plane (byte stride -> float index). */
        src.current_offset += (src_stride_z >> 2);
        weights.current_offset += (weights_stride_z >> 2);
    }

#if defined(BIAS)
    acc += LOAD4(biases, vector_offset(biases, int(out_plane)));
#endif /* defined(BIAS) */

    STORE4(dst, CURRENT_OFFSET(dst), acc);
}
181
182#elif defined(DATA_TYPE_FP16)
183
184precision mediump float;
185
ASIAPAC\steli0123ac91b2017-11-07 16:14:44 +0800186#if defined(PROCESS_4X_1Y_1Z)
187
188/** This kernel performs a direct convolution to convolve the low three dimensions
189 *
190 * @note This OpenGL ES shader works with stride_x = 1 and 2
191 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
 * @note If biases are used then "#define BIAS" has to be passed at compile time
193 *
194 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
195 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
196 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
197 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
198 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
199 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
200 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
201 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
202 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
203 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
204 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
205 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
207 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
208 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
209 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
211 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
212 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
213 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
214 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
215 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
216 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
217 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
218 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
219 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
220 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
221 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
222 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
223 * @param[in] weights_depth The third dimensions of the weights tensors
224 */
225
/* Kernel arguments of the FP16 PROCESS_4X_1Y_1Z variant, laid out with std140
 * rules. The member order is part of the buffer layout and is kept as before. */
layout(std140) uniform shader_params
{
    TENSOR3D_PARAM_DECLARATION(src);
    TENSOR3D_PARAM_DECLARATION(dst);
    TENSOR3D_PARAM_DECLARATION(weights);
#if defined(BIAS)
    VECTOR_PARAM_DECLARATION(biases);
#endif /* defined(BIAS) */
    uint weights_stride_w; /* multiplied by the Z invocation index in main() to reach the weights of that output plane */
    uint weights_depth;    /* iteration count of the depth loop in main() */
};
237
/* Storage buffers of the FP16 PROCESS_4X_1Y_1Z variant.
 * src/dst elements are uvec2 values that the kernel unpacks to / packs from
 * four half floats; weights/biases elements are uints holding two half floats. */
BUFFER_DECLARATION(src, 1, uvec2, readonly);
BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
BUFFER_DECLARATION(weights, 3, uint, readonly);
#if defined(BIAS)
BUFFER_DECLARATION(biases, 4, uint, readonly);
#endif /* defined(BIAS) */
244
/* Bind the generic LOAD_SRC / CONVOLVE1x5 names to the helpers written for
 * the horizontal stride chosen at compile time (STRIDE_X must be 1 or 2). */
#if(STRIDE_X == 1)
#define LOAD_SRC(img, r) load_src_stride1(img, r)
#define CONVOLVE1x5(pix, taps) convolve1x5_stride1(pix, taps)
#elif(STRIDE_X == 2) /* STRIDE_X == 1 */
#define LOAD_SRC(img, r) load_src_stride2(img, r)
#define CONVOLVE1x5(pix, taps) convolve1x5_stride2(pix, taps)
#else /* STRIDE_X == 1 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 1 */
254
/* Reads two uvec2 elements of source row `row` and unpacks them into two vec4
 * values, i.e. eight half floats converted to float. */
vec4[2] load_src_stride1(Image src, int row)
{
    uvec2 raw[2];
    GC_LOAD2_2D_OFFSET(raw, src, 0, row);

    vec4 unpacked[2];
    for(int i = 0; i < 2; ++i)
    {
        unpacked[i] = vec4(unpackHalf2x16(raw[i].x), unpackHalf2x16(raw[i].y));
    }

    return unpacked;
}
267
/* Reads three uvec2 elements of source row `row` and unpacks them into three
 * vec4 values, i.e. twelve half floats converted to float. */
vec4[3] load_src_stride2(Image src, int row)
{
    uvec2 raw[3];
    GC_LOAD3_2D_OFFSET(raw, src, 0, row);

    vec4 unpacked[3];
    for(int i = 0; i < 3; ++i)
    {
        unpacked[i] = vec4(unpackHalf2x16(raw[i].x), unpackHalf2x16(raw[i].y));
    }

    return unpacked;
}
281
/* Reads three uints of weights row `row` and unpacks each into a vec2 of two
 * half floats. The 1x5 convolution helpers use the first five of the six
 * resulting values. */
vec2[3] load_weight(Tensor3D weights, int row)
{
    uvec3 raw_w;
    GC_LOAD3_3D_OFFSET(raw_w, weights, 0, row, 0);

    vec2 taps[3];
    for(int i = 0; i < 3; ++i)
    {
        taps[i] = vec2(unpackHalf2x16(raw_w[i]));
    }

    return taps;
}
295
/* 1x5 convolution of one row for four adjacent outputs at horizontal stride 1.
 * tmp holds eight consecutive inputs; tap k is applied to inputs k .. k+3.
 * The taps are w[0].x, w[0].y, w[1].x, w[1].y and w[2].x, in that order. */
vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
{
    vec4 lo = tmp[0];
    vec4 hi = tmp[1];

    return lo * w[0].x
           + vec4(lo.yzw, hi.x) * w[0].y
           + vec4(lo.zw, hi.xy) * w[1].x
           + vec4(lo.w, hi.xyz) * w[1].y
           + hi * w[2].x;
}
307
/* 1x5 convolution of one row for four outputs at horizontal stride 2.
 * tmp holds twelve consecutive inputs; tap k is applied to inputs
 * k, k+2, k+4 and k+6. The taps are w[0].x, w[0].y, w[1].x, w[1].y and
 * w[2].x, in that order. */
vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
{
    vec4 a = tmp[0];
    vec4 b = tmp[1];
    vec4 c = tmp[2];

    return vec4(a.xz, b.xz) * w[0].x
           + vec4(a.yw, b.yw) * w[0].y
           + vec4(a.z, b.xz, c.x) * w[1].x
           + vec4(a.w, b.yw, c.y) * w[1].y
           + vec4(b.x, b.z, c.xz) * w[2].x;
}
319
320/** This kernel performs a direct convolution to convolve the low three dimensions.
321 *
322 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
 * @note If biases are used then "#define BIAS" has to be passed at compile time
324 *
325 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
326 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
327 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
328 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
329 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
330 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
331 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
332 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
333 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
334 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
335 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
336 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
338 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
339 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
340 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Joel Liangf1f3ebd2017-11-10 09:59:19 +0800341 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Anthony Barbier7068f992017-10-26 15:23:08 +0100342 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
343 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
344 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
345 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
346 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
347 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
348 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
349 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
350 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
351 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
352 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
353 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
354 * @param[in] weights_depth The third dimensions of the weights tensors
355 */
/* FP16 PROCESS_4X_1Y_1Z entry point: every invocation produces four adjacent
 * output values of one row. For each of the weights_depth planes it
 * accumulates five 1x5 row convolutions, optionally adds the bias of the
 * current Z index, and stores the result as four packed half floats. */
void main()
{
    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);

#if defined(BIAS)
    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* defined(BIAS) */

    uint out_plane = gl_GlobalInvocationID.z;
    vec4 acc       = vec4(0);
    vec2 k_row[3];
    vec4 in_row[STRIDE_X + 1];

    /* Select the weights of this output plane. */
    weights.current_offset += out_plane * weights_stride_w;

    for(int d = 0; d < int(weights_depth); ++d)
    {
        for(int row = 0; row < 5; row++)
        {
            k_row  = load_weight(weights, row);
            in_row = LOAD_SRC(src, row);
            acc += CONVOLVE1x5(in_row, k_row);
        }

        /* Advance both tensors to the next plane. */
        src.current_offset += src_stride_z;
        weights.current_offset += weights_stride_z;
    }

#if defined(BIAS)
    uint packed_b;
    GC_LOAD1_1D_OFFSET(packed_b, biases, out_plane);

    /* One uint carries two half-float biases: even Z indices take the first
     * component of the unpacked pair, odd ones the second. */
    vec2  bias_pair = unpackHalf2x16(packed_b);
    float b;
    if(out_plane % uint(2) == uint(0))
    {
        b = bias_pair.x;
    }
    else
    {
        b = bias_pair.y;
    }
    acc += vec4(b);
#endif /* defined(BIAS) */

    uvec2 packed_d = uvec2(packHalf2x16(acc.xy), packHalf2x16(acc.zw));
    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
}
399
ASIAPAC\steli0123ac91b2017-11-07 16:14:44 +0800400#elif defined(PROCESS_4X_3Y_1Z)
401
/** An optimized direct convolution 5x5 OpenGL ES compute shader that processes 3 elements along Y at once
403 *
404 * @note This OpenGL ES shader works with stride_x = 1 and 2
405 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
 * @note If biases are used then "#define BIAS" has to be passed at compile time
407 *
408 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
409 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
410 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
411 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
412 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
413 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
414 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
415 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
416 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
417 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
418 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
419 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
421 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
422 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
423 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
425 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
426 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
427 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
428 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
429 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
430 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
431 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
432 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
433 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
434 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
435 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
436 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
437 * @param[in] weights_depth The third dimensions of the weights tensors
438 */
439
/* Kernel arguments of the FP16 PROCESS_4X_3Y_1Z variant, laid out with std140
 * rules. The member order is part of the buffer layout and is unchanged.
 *
 * The bias members are declared under the name `bias` so that they share the
 * name used by BUFFER_DECLARATION(bias, ...) and by every macro call in this
 * variant's main(). They were previously declared as `biases`, which left the
 * `bias`-prefixed parameters that the name-pasting helper macros presumably
 * reference undeclared whenever BIAS is defined.
 *
 * NOTE(review): only the member names change (same types, count and order).
 * This assumes the host fills the block by layout, not by member name -
 * confirm. The doc comment above still lists these parameters as `biases_*`.
 */
layout(std140) uniform shader_params
{
    TENSOR3D_PARAM_DECLARATION(src);
    TENSOR3D_PARAM_DECLARATION(dst);
    TENSOR3D_PARAM_DECLARATION(weights);
#ifdef BIAS
    VECTOR_PARAM_DECLARATION(bias);
#endif /* BIAS */
    uint weights_stride_w; /* multiplied by the Z invocation index in main() to reach the weights of that output plane */
    uint weights_depth;    /* iteration count of the depth loop in main() */
};
451
/* Storage buffers of the FP16 PROCESS_4X_3Y_1Z variant.
 * src/dst elements are uvec2 values that the kernel unpacks to / packs from
 * four half floats; weights/bias elements are uints holding two half floats. */
BUFFER_DECLARATION(src, 1, uvec2, readonly);
BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
BUFFER_DECLARATION(weights, 3, uint, readonly);
#if defined(BIAS)
BUFFER_DECLARATION(bias, 4, uint, readonly);
#endif /* defined(BIAS) */
458
/* Bind the generic LOAD_SRC / CONVOLVE1x5 names to the helpers written for
 * the horizontal stride chosen at compile time (STRIDE_X must be 1 or 2). */
#if(STRIDE_X == 1)
#define LOAD_SRC(img, r) load_src_stride1(img, r)
#define CONVOLVE1x5(pix, taps) convolve1x5_stride1(pix, taps)
#elif(STRIDE_X == 2) /* STRIDE_X == 1 */
#define LOAD_SRC(img, r) load_src_stride2(img, r)
#define CONVOLVE1x5(pix, taps) convolve1x5_stride2(pix, taps)
#else /* STRIDE_X == 1 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 1 */
468
/* Reads two uvec2 elements of source row `row` and unpacks them into two vec4
 * values, i.e. eight half floats converted to float. */
vec4[2] load_src_stride1(Image src, int row)
{
    uvec2 raw[2];
    GC_LOAD2_2D_OFFSET(raw, src, 0, row);

    vec4 unpacked[2];
    for(int i = 0; i < 2; ++i)
    {
        unpacked[i] = vec4(unpackHalf2x16(raw[i].x), unpackHalf2x16(raw[i].y));
    }

    return unpacked;
}
481
/* Reads three uvec2 elements of source row `row` and unpacks them into three
 * vec4 values, i.e. twelve half floats converted to float. */
vec4[3] load_src_stride2(Image src, int row)
{
    uvec2 raw[3];
    GC_LOAD3_2D_OFFSET(raw, src, 0, row);

    vec4 unpacked[3];
    for(int i = 0; i < 3; ++i)
    {
        unpacked[i] = vec4(unpackHalf2x16(raw[i].x), unpackHalf2x16(raw[i].y));
    }

    return unpacked;
}
495
/* Reads three uints of weights row `row` and unpacks each into a vec2 of two
 * half floats. The 1x5 convolution helpers use the first five of the six
 * resulting values. */
vec2[3] load_weight(Tensor3D weights, int row)
{
    uvec3 raw_w;
    GC_LOAD3_3D_OFFSET(raw_w, weights, 0, row, 0);

    vec2 taps[3];
    for(int i = 0; i < 3; ++i)
    {
        taps[i] = vec2(unpackHalf2x16(raw_w[i]));
    }

    return taps;
}
509
/* 1x5 convolution of one row for four adjacent outputs at horizontal stride 1.
 * tmp holds eight consecutive inputs; tap k is applied to inputs k .. k+3.
 * The taps are w[0].x, w[0].y, w[1].x, w[1].y and w[2].x, in that order. */
vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
{
    vec4 lo = tmp[0];
    vec4 hi = tmp[1];

    return lo * w[0].x
           + vec4(lo.yzw, hi.x) * w[0].y
           + vec4(lo.zw, hi.xy) * w[1].x
           + vec4(lo.w, hi.xyz) * w[1].y
           + hi * w[2].x;
}
521
/* 1x5 convolution of one row for four outputs at horizontal stride 2.
 * tmp holds twelve consecutive inputs; tap k is applied to inputs
 * k, k+2, k+4 and k+6. The taps are w[0].x, w[0].y, w[1].x, w[1].y and
 * w[2].x, in that order. */
vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
{
    vec4 a = tmp[0];
    vec4 b = tmp[1];
    vec4 c = tmp[2];

    return vec4(a.xz, b.xz) * w[0].x
           + vec4(a.yw, b.yw) * w[0].y
           + vec4(a.z, b.xz, c.x) * w[1].x
           + vec4(a.w, b.yw, c.y) * w[1].y
           + vec4(b.x, b.z, c.xz) * w[2].x;
}
533
/* FP16 PROCESS_4X_3Y_1Z entry point: every invocation produces three output
 * rows of four values each. Per plane, the five weight rows are loaded once
 * and seven source rows are read; source row `line` contributes to output row
 * `out_y` through weight row (line - out_y) whenever that index is in 0..4.
 * The bias of the current Z index is optionally added to all three rows before
 * they are stored as packed half floats. */
void main()
{
    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);

#if defined(BIAS)
    Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
#endif /* defined(BIAS) */

    uint out_plane = gl_GlobalInvocationID.z;
    vec4 acc[3];
    vec2 k_rows[5][3];
    vec4 in_row[STRIDE_X + 1];

    for(int out_y = 0; out_y < 3; ++out_y)
    {
        acc[out_y] = vec4(0);
    }

    /* Select the weights of this output plane. */
    weights.current_offset += out_plane * weights_stride_w;

    for(int d = 0; d < int(weights_depth); ++d)
    {
        /* The five weight rows of this plane are shared by all output rows. */
        for(int k = 0; k < 5; ++k)
        {
            k_rows[k] = load_weight(weights, k);
        }

        /* Each source row is loaded once and applied to every output row it
         * overlaps; all loop bounds are compile-time constants. */
        for(int line = 0; line < 7; ++line)
        {
            in_row = LOAD_SRC(src, line);

            for(int out_y = 0; out_y < 3; ++out_y)
            {
                int k = line - out_y;
                if(k >= 0 && k < 5)
                {
                    acc[out_y] += CONVOLVE1x5(in_row, k_rows[k]);
                }
            }
        }

        /* Advance both tensors to the next plane. */
        src.current_offset += src_stride_z;
        weights.current_offset += weights_stride_z;
    }

#if defined(BIAS)
    uint packed_b;
    GC_LOAD1_1D_OFFSET(packed_b, bias, out_plane);

    /* One uint carries two half-float biases: even Z indices take the first
     * component of the unpacked pair, odd ones the second. */
    vec2  bias_pair = unpackHalf2x16(packed_b);
    float b;
    if(out_plane % uint(2) == uint(0))
    {
        b = bias_pair.x;
    }
    else
    {
        b = bias_pair.y;
    }

    for(int out_y = 0; out_y < 3; ++out_y)
    {
        acc[out_y] += vec4(b);
    }
#endif /* defined(BIAS) */

    for(int out_y = 0; out_y < 3; ++out_y)
    {
        uvec2 packed_d = uvec2(packHalf2x16(acc[out_y].xy), packHalf2x16(acc[out_y].zw));
        GC_STORE1_3D_OFFSET(packed_d, dst, 0, out_y, 0);
    }
}
624
625#elif defined(PROCESS_4X_3Y_2Z)
626
/** An optimized direct convolution 5x5 OpenGL ES compute shader that processes 3 elements along Y and 2 elements along Z at once
628 *
629 * @note This OpenGL ES shader works with stride_x = 1 and 2
630 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
 * @note If biases are used then "#define BIAS" has to be passed at compile time
632 *
633 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
634 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
635 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
636 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
637 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
638 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
639 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
640 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
641 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
642 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
643 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
644 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
646 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
647 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
648 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
650 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
651 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
652 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
653 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
654 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
655 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
656 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
657 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
658 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
659 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
660 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
661 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
 * @param[in] weights_depth The third dimension (depth) of the weights tensor
663 */
664
/* Uniform block holding the layout parameters of every tensor used by this variant,
 * followed by the two convolution-specific values read in main().
 *
 * NOTE(review): the bias parameters are declared here under the name `biases`, whereas the
 * buffer declaration and main() of this variant use `bias`. Confirm that the helper macros
 * (defined outside this file section) resolve both spellings consistently when BIAS is defined.
 */
layout(std140) uniform shader_params
{
    TENSOR3D_PARAM_DECLARATION(src);
    TENSOR3D_PARAM_DECLARATION(dst);
    TENSOR3D_PARAM_DECLARATION(weights);
#if defined(BIAS)
    VECTOR_PARAM_DECLARATION(biases);
#endif /* defined(BIAS) */
    uint weights_stride_w; // multiplied by the output Z index in main() to pick the starting weights offset
    uint weights_depth;    // trip count of the depth loop in main()
};
676
// Buffers accessed by this variant. The source and destination elements are uvec2 and the
// weights/bias elements are uint; every 32-bit component is converted with
// unpackHalf2x16/packHalf2x16 in the code below, i.e. it carries two half-float values.
// The numeric argument is presumably the binding point - confirm against BUFFER_DECLARATION.
BUFFER_DECLARATION(src, 1, uvec2, readonly);
BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
BUFFER_DECLARATION(weights, 3, uint, readonly);
#if defined(BIAS)
BUFFER_DECLARATION(bias, 4, uint, readonly);
#endif /* defined(BIAS) */
683
// Bind the row loader and the 1x5 row convolution to the implementation that matches STRIDE_X.
#if (STRIDE_X == 1)
#define LOAD_SRC(img, r) load_src_stride1(img, r)
#define CONVOLVE1x5(px, k) convolve1x5_stride1(px, k)
#elif (STRIDE_X == 2) /* STRIDE_X == 1 */
#define LOAD_SRC(img, r) load_src_stride2(img, r)
#define CONVOLVE1x5(px, k) convolve1x5_stride2(px, k)
#else /* STRIDE_X == 1 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 1 */
693
/** Load one source row for the stride-1 path and expand it to floats.
 *
 * Two uvec2 elements are fetched with GC_LOAD2_2D_OFFSET at X offset 0 and the requested row;
 * each 32-bit component is then unpacked with unpackHalf2x16, giving eight float values.
 *
 * @param[in] src Source image descriptor handed to the load macro
 * @param[in] row Row offset handed to the load macro
 *
 * @return Two vec4 holding the eight unpacked values in load order
 */
vec4[2] load_src_stride1(Image src, int row)
{
    uvec2 raw[2];

    GC_LOAD2_2D_OFFSET(raw, src, 0, row);

    return vec4[2](vec4(unpackHalf2x16(raw[0].x), unpackHalf2x16(raw[0].y)),
                   vec4(unpackHalf2x16(raw[1].x), unpackHalf2x16(raw[1].y)));
}
706
/** Load one source row for the stride-2 path and expand it to floats.
 *
 * Three uvec2 elements are fetched with GC_LOAD3_2D_OFFSET at X offset 0 and the requested row;
 * each 32-bit component is then unpacked with unpackHalf2x16, giving twelve float values.
 *
 * @param[in] src Source image descriptor handed to the load macro
 * @param[in] row Row offset handed to the load macro
 *
 * @return Three vec4 holding the twelve unpacked values in load order
 */
vec4[3] load_src_stride2(Image src, int row)
{
    uvec2 raw[3];
    vec4  px[3];

    GC_LOAD3_2D_OFFSET(raw, src, 0, row);

    for(int i = 0; i < 3; ++i)
    {
        px[i] = vec4(unpackHalf2x16(raw[i].x), unpackHalf2x16(raw[i].y));
    }

    return px;
}
720
/** Load one row of kernel weights and expand it to floats.
 *
 * Three 32-bit words are fetched with GC_LOAD3_3D_OFFSET at (0, row, 0) and each is unpacked
 * with unpackHalf2x16. The row convolutions in this file read the first five of the six
 * resulting values (w[0].x, w[0].y, w[1].x, w[1].y, w[2].x).
 *
 * @param[in] weights Weights tensor descriptor handed to the load macro
 * @param[in] row     Kernel row offset handed to the load macro
 *
 * @return Three vec2 holding the unpacked weights in load order
 */
vec2[3] load_weight(Tensor3D weights, int row)
{
    uvec3 raw;

    GC_LOAD3_3D_OFFSET(raw, weights, 0, row, 0);

    return vec2[3](unpackHalf2x16(raw.x), unpackHalf2x16(raw.y), unpackHalf2x16(raw.z));
}
734
/** Apply a 5-tap horizontal filter at four adjacent positions (stride 1).
 *
 * Treating tmp[0] and tmp[1] as eight consecutive inputs in[0..7], output lane i is
 * sum over t = 0..4 of in[i + t] * tap[t], where tap = { w[0].x, w[0].y, w[1].x, w[1].y, w[2].x }.
 *
 * @param[in] tmp Eight input values, as returned by load_src_stride1
 * @param[in] w   One kernel row, as returned by load_weight (w[2].y is not used)
 *
 * @return The four filtered values
 */
vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
{
    vec4 lo = tmp[0];
    vec4 hi = tmp[1];

    // The taps are accumulated in the same left-to-right order as a single chained expression.
    vec4 sum = lo * w[0].x;
    sum += vec4(lo.yzw, hi.x) * w[0].y;
    sum += vec4(lo.zw, hi.xy) * w[1].x;
    sum += vec4(lo.w, hi.xyz) * w[1].y;
    sum += hi * w[2].x;

    return sum;
}
746
/** Apply a 5-tap horizontal filter at four positions that are two inputs apart (stride 2).
 *
 * Treating tmp[0..2] as twelve consecutive inputs in[0..11], output lane i is
 * sum over t = 0..4 of in[2 * i + t] * tap[t], where tap = { w[0].x, w[0].y, w[1].x, w[1].y, w[2].x }.
 *
 * @param[in] tmp Twelve input values, as returned by load_src_stride2 (in[11] is not used)
 * @param[in] w   One kernel row, as returned by load_weight (w[2].y is not used)
 *
 * @return The four filtered values
 */
vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
{
    vec4 a = tmp[0];
    vec4 b = tmp[1];
    vec4 c = tmp[2];

    // Each gather picks inputs t, t + 2, t + 4 and t + 6 for tap t.
    vec4 sum = vec4(a.xz, b.xz) * w[0].x;
    sum += vec4(a.yw, b.yw) * w[0].y;
    sum += vec4(a.z, b.xz, c.x) * w[1].x;
    sum += vec4(a.w, b.yw, c.y) * w[1].y;
    sum += vec4(b.x, b.z, c.xz) * w[2].x;

    return sum;
}
758
/** Kernel entry point: each invocation produces three output rows of four values for two
 *  consecutive output Z indices.
 *
 * For every Z index the accumulators are cleared, the source offset is rewound to its initial
 * value, and the depth loop walks weights_depth slices. Within a slice the five kernel rows are
 * loaded once and seven source rows are read; each source row contributes to every output row
 * whose 5-row window contains it.
 *
 * NOTE(review): the weights offset is not rewound between the two Z iterations. After the depth
 * loop it has advanced by weights_depth * weights_stride_z, so the second iteration addresses
 * the next kernel only if that product equals weights_stride_w - confirm with the host code.
 */
void main()
{
    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);

#if defined(BIAS)
    Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
#endif /* defined(BIAS) */

    vec4 acc[3];           // one accumulator per output row
    vec2 taps[5][3];       // the five kernel rows of the current depth slice
    vec4 px[STRIDE_X + 1]; // the source row currently being consumed
    int  i;

    uint first_z  = gl_GlobalInvocationID.z;
    uint src_base = src.current_offset;

    weights.current_offset += first_z * weights_stride_w;

    for(int z = 0; z < 2; ++z)
    {
        uint z_index = first_z + uint(z);

        src.current_offset = src_base;

        acc[0] = vec4(0);
        acc[1] = vec4(0);
        acc[2] = vec4(0);

        for(int d = 0; d < int(weights_depth); ++d)
        {
            // Fetch all kernel rows of this slice up front; they are reused by several source rows.
            for(int k = 0; k < 5; ++k)
            {
                taps[k] = load_weight(weights, k);
            }

            // Source row 0 feeds output row 0 only.
            px = LOAD_SRC(src, 0);
            acc[0] += CONVOLVE1x5(px, taps[0]);

            // Source row 1 feeds output rows 0 and 1.
            px = LOAD_SRC(src, 1);
            acc[0] += CONVOLVE1x5(px, taps[1]);
            acc[1] += CONVOLVE1x5(px, taps[0]);

            // Source rows 2 to 4 feed all three output rows.
            px = LOAD_SRC(src, 2);
            acc[0] += CONVOLVE1x5(px, taps[2]);
            acc[1] += CONVOLVE1x5(px, taps[1]);
            acc[2] += CONVOLVE1x5(px, taps[0]);

            px = LOAD_SRC(src, 3);
            acc[0] += CONVOLVE1x5(px, taps[3]);
            acc[1] += CONVOLVE1x5(px, taps[2]);
            acc[2] += CONVOLVE1x5(px, taps[1]);

            px = LOAD_SRC(src, 4);
            acc[0] += CONVOLVE1x5(px, taps[4]);
            acc[1] += CONVOLVE1x5(px, taps[3]);
            acc[2] += CONVOLVE1x5(px, taps[2]);

            // Source row 5 feeds output rows 1 and 2.
            px = LOAD_SRC(src, 5);
            acc[1] += CONVOLVE1x5(px, taps[4]);
            acc[2] += CONVOLVE1x5(px, taps[3]);

            // Source row 6 feeds output row 2 only.
            px = LOAD_SRC(src, 6);
            acc[2] += CONVOLVE1x5(px, taps[4]);

            // Advance to the next depth slice of both the source and the kernel.
            src.current_offset += src_stride_z;
            weights.current_offset += weights_stride_z;
        }

#if defined(BIAS)
        uint bias_word;

        GC_LOAD1_1D_OFFSET(bias_word, bias, z_index);

        // The loaded word holds two half values; even Z indices take the first, odd ones the second.
        vec2  bias_pair = unpackHalf2x16(bias_word);
        float b         = ((z_index & 1u) == 0u) ? bias_pair.x : bias_pair.y;

        for(i = 0; i < 3; i++)
        {
            acc[i] += vec4(b);
        }
#endif /* defined(BIAS) */

        // Repack every output row into two 32-bit words (four half values) and store it.
        for(i = 0; i < 3; i++)
        {
            uvec2 row_word = uvec2(packHalf2x16(acc[i].xy), packHalf2x16(acc[i].zw));
            GC_STORE1_3D_OFFSET(row_word, dst, 0, i, 0);
        }

        dst.current_offset += dst_stride_z;
    }
}
858
859#elif defined(PROCESS_8X_1Y_1Z)
860
/** An optimized direct convolution 5x5 OpenGL ES compute shader that processes 8 elements along X at once
862 *
863 * @note This OpenGL ES shader works with stride_x = 1
864 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
 * @note If biases are used then "#define BIAS" has to be passed at compile time
866 *
867 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
868 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
869 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
870 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
871 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
872 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
873 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
874 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
875 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
876 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
877 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
878 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
880 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
881 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
882 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
884 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
885 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
886 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
887 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
888 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
889 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
890 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
891 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
892 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
893 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
894 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
895 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
 * @param[in] weights_depth The third dimension (depth) of the weights tensor
897 */
898
/* Uniform block holding the layout parameters of every tensor used by this variant,
 * followed by the two convolution-specific values read in main().
 *
 * NOTE(review): the bias parameters are declared here under the name `biases`, whereas the
 * buffer declaration and main() of this variant use `bias`. Confirm that the helper macros
 * (defined outside this file section) resolve both spellings consistently when BIAS is defined.
 */
layout(std140) uniform shader_params
{
    TENSOR3D_PARAM_DECLARATION(src);
    TENSOR3D_PARAM_DECLARATION(dst);
    TENSOR3D_PARAM_DECLARATION(weights);
#if defined(BIAS)
    VECTOR_PARAM_DECLARATION(biases);
#endif /* defined(BIAS) */
    uint weights_stride_w; // multiplied by the output Z index in main() to pick the starting weights offset
    uint weights_depth;    // trip count of the depth loop in main()
};
910
// Buffers accessed by this variant. The source and destination elements are uvec4 and the
// weights/bias elements are uint; every 32-bit component is converted with
// unpackHalf2x16/packHalf2x16 in the code below, i.e. it carries two half-float values.
// The numeric argument is presumably the binding point - confirm against BUFFER_DECLARATION.
BUFFER_DECLARATION(src, 1, uvec4, readonly);
BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
BUFFER_DECLARATION(weights, 3, uint, readonly);
#if defined(BIAS)
BUFFER_DECLARATION(bias, 4, uint, readonly);
#endif /* defined(BIAS) */
917
// Only the stride-1 implementation exists for this variant; any other STRIDE_X stops compilation.
#if (STRIDE_X == 1)
#define LOAD_SRC(img, r) load_src_stride1(img, r)
#define CONVOLVE1x5(px, k) convolve1x5_stride1(px, k)
#elif (STRIDE_X == 2) /* STRIDE_X == 1 */
#error stride == 2 for PROCESS_8X_1Y not implemented
#else /* STRIDE_X == 1 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 1 */
926
/** Load one source row for the 8-wide stride-1 path and expand it to floats.
 *
 * Two uvec4 elements are fetched with GC_LOAD2_2D_OFFSET at X offset 0 and the requested row.
 * All four components of the first element and the first two components of the second one are
 * unpacked with unpackHalf2x16, giving twelve float values; the remaining two components are
 * not used.
 *
 * @param[in] src Source image descriptor handed to the load macro
 * @param[in] row Row offset handed to the load macro
 *
 * @return Three vec4 holding the twelve unpacked values in load order
 */
vec4[3] load_src_stride1(Image src, int row)
{
    uvec4 raw[2];

    GC_LOAD2_2D_OFFSET(raw, src, 0, row);

    return vec4[3](vec4(unpackHalf2x16(raw[0].x), unpackHalf2x16(raw[0].y)),
                   vec4(unpackHalf2x16(raw[0].z), unpackHalf2x16(raw[0].w)),
                   vec4(unpackHalf2x16(raw[1].x), unpackHalf2x16(raw[1].y)));
}
940
/** Load one row of kernel weights and expand it to floats.
 *
 * Three 32-bit words are fetched with GC_LOAD3_3D_OFFSET at (0, row, 0) and each is unpacked
 * with unpackHalf2x16. The row convolution of this variant reads the first five of the six
 * resulting values (w[0].x, w[0].y, w[1].x, w[1].y, w[2].x).
 *
 * @param[in] weights Weights tensor descriptor handed to the load macro
 * @param[in] row     Kernel row offset handed to the load macro
 *
 * @return Three vec2 holding the unpacked weights in load order
 */
vec2[3] load_weight(Tensor3D weights, int row)
{
    uvec3 raw;

    GC_LOAD3_3D_OFFSET(raw, weights, 0, row, 0);

    return vec2[3](unpackHalf2x16(raw.x), unpackHalf2x16(raw.y), unpackHalf2x16(raw.z));
}
954
/** Apply a 5-tap horizontal filter at eight adjacent positions (stride 1).
 *
 * Treating tmp[0..2] as twelve consecutive inputs in[0..11], output lane i (0..7) is
 * sum over t = 0..4 of in[i + t] * tap[t], where tap = { w[0].x, w[0].y, w[1].x, w[1].y, w[2].x }.
 *
 * @param[in] tmp Twelve input values, as returned by load_src_stride1
 * @param[in] w   One kernel row, as returned by load_weight (w[2].y is not used)
 *
 * @return Two vec4: outputs 0..3 in element 0 and outputs 4..7 in element 1
 */
vec4[2] convolve1x5_stride1(vec4 tmp[3], vec2 w[3])
{
    vec4 sums[2];

    // Output group g reads the window formed by tmp[g] and tmp[g + 1].
    for(int g = 0; g < 2; ++g)
    {
        vec4 lo = tmp[g];
        vec4 hi = tmp[g + 1];

        sums[g] = lo * w[0].x + vec4(lo.yzw, hi.x) * w[0].y + vec4(lo.zw, hi.xy) * w[1].x + vec4(lo.w, hi.xyz) * w[1].y + hi * w[2].x;
    }

    return sums;
}
975
/** Kernel entry point: each invocation produces eight adjacent output values along X for one
 *  output Z index.
 *
 * The depth loop walks weights_depth slices; for each slice the five kernel rows are convolved
 * with the five matching source rows and accumulated. An optional bias is then added and the
 * eight results are packed into four 32-bit words (two half values each) and stored.
 */
void main()
{
    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);

#ifdef BIAS
    Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
#endif /* BIAS */

    vec4  res[2];     // running sums: outputs 0..3 and outputs 4..7
    vec4  row_res[2]; // contribution of the kernel row currently being processed
    vec2  w[3];
    vec4  s[STRIDE_X + 2];
    uvec4 packed_d;
    uint  z_index = gl_GlobalInvocationID.z;

    res[0] = vec4(0);
    res[1] = vec4(0);

    // Select the kernel that belongs to this output Z index.
    weights.current_offset += z_index * weights_stride_w;

    for(int d = 0; d < int(weights_depth); ++d)
    {
        for(int row = 0; row < 5; row++)
        {
            w = load_weight(weights, row);
            s = LOAD_SRC(src, row);

            // Evaluate the row convolution once and use both halves of its result,
            // instead of invoking it twice and discarding half of each result.
            row_res = CONVOLVE1x5(s, w);
            res[0] += row_res[0];
            res[1] += row_res[1];
        }

        // Advance to the next depth slice of both the source and the kernel.
        src.current_offset += src_stride_z;
        weights.current_offset += weights_stride_z;
    }

#ifdef BIAS
    uint  packed_b;
    float b;

    GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);

    // The loaded word holds two half values; even Z indices take the first, odd ones the second.
    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
    res[0] += vec4(b);
    res[1] += vec4(b);
#endif /* BIAS */

    packed_d.xy = uvec2(packHalf2x16(res[0].xy), packHalf2x16(res[0].zw));
    packed_d.zw = uvec2(packHalf2x16(res[1].xy), packHalf2x16(res[1].zw));
    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
}
1024
1025#else /* defined(PROCESS_4X_1Y_1Z) */
1026
1027#endif /* defined(PROCESS_4X_1Y_1Z) */
1028
Anthony Barbier7068f992017-10-26 15:23:08 +01001029#else /* DATA_TYPE_FP16 */
1030#error Data type not supported
1031#endif /* DATA_TYPE_FP16 */