blob: b42c09bbc7d240a0eaeaa4b0d637b0b5fb103a4e [file] [log] [blame]
Anthony Barbier7068f992017-10-26 15:23:08 +01001/*
Isabella Gottardi3f217ec2018-02-12 14:59:19 +00002 * Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier7068f992017-10-26 15:23:08 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
26
Joel Liangf1f3ebd2017-11-10 09:59:19 +080027#include "helpers_cs.h"
Anthony Barbier7068f992017-10-26 15:23:08 +010028
Isabella Gottardi3f217ec2018-02-12 14:59:19 +000029#ifdef FUSED_ACTIVATION
30#include "activation_layer_helpers_cs.h"
31#endif /* FUSED_ACTIVATION */
32
Joel Liangf1f3ebd2017-11-10 09:59:19 +080033#if defined(DATA_TYPE_FP16)
34precision mediump float;
35#endif // DATA_TYPE_FP16
36
37/** This kernel performs a direct convolution to convolve the low three dimensions.
38 *
39 * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
Xinghang Zhou4af62a02017-11-02 16:37:24 +080040 * @note This kernel has multiple optimized direct convolution options for FP16.
41 * The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
Joel Liangf1f3ebd2017-11-10 09:59:19 +080042 * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
43 * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
44 *
45 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
46 * @param[in] src_attrs The attributes of the source tensor
47 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
48 * @param[in] dst_attrs The attributes of the destination tensor
49 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
50 * @param[in] weights_attrs The attributes of the weights tensor
 * @param[in]  biases_ptr      Pointer to the biases tensor. Supported data types: same as @p src_ptr
 * @param[in]  biases_attrs    The attributes of the biases tensor
53 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
54 * @param[in] weights_depth The third dimensions of the weights tensors
55 */
/* Uniform parameters shared by every variant of this shader. */
SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;     // strides/offsets describing the input tensor
    Tensor3DAttributes dst_attrs;     // strides/offsets describing the output tensor
    Tensor3DAttributes weights_attrs; // strides/offsets describing the weights tensor
#ifdef BIAS
    VectorAttributes biases_attrs;    // strides/offset describing the biases vector
#endif /* BIAS */
    uint weights_stride_w; // byte stride between consecutive weight sets (4th dimension)
    uint weights_depth;    // number of depth planes accumulated per output element
};
67
#if defined(DATA_TYPE_FP32)
/* FP32 path: every buffer element is a single 32-bit float. */
TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
75
/** FP32 variant: each invocation produces one output element.
 *
 * Only the depth dimension is traversed here (one input element per plane),
 * i.e. a 1x1 filter footprint; gl_GlobalInvocationID.z selects the output
 * feature map and therefore the weight set.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    float pixels  = 0.f;
    uint  z_index = gl_GlobalInvocationID.z;
    // Jump to the weight set of the output feature map computed by this invocation.
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    float temp;
    float temp_weight;
    // Multiply-accumulate along the input depth, one plane per iteration.
    for(int d = 0; d < int(weights_depth); ++d)
    {
        temp        = LOAD_CURRENT_ITEM(src_ptr, src_iter);
        temp_weight = LOAD_CURRENT_ITEM(weights_ptr, weights_iter);
        pixels += temp * temp_weight;

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#ifdef BIAS
    // One bias value per output feature map.
    pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels = ACT_OP(pixels);
#endif /* FUSED_ACTIVATION */

    STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
Anthony Barbier7068f992017-10-26 15:23:08 +0100112
#elif defined(DATA_TYPE_FP16)
#if defined(PROCESS_4X_1Y_1Z)
/* FP16 path computing 4 output elements along x per invocation.
 * src/dst are read via LOAD_UNPACK4_*_HALF, i.e. a uvec2 packs 4 half values;
 * weights via LOAD_UNPACK2_*_HALF, i.e. a uint packs 2 half values. */
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

/* Pick the stride-specific convolve helper at compile time. */
#if STRIDE_X == 2
#define CONVOLVE(s, w) convolve_stride2(s, w)
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
#define CONVOLVE(s, w) convolve_stride1(s, w)
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
129
/** Stride-1 tap: load 4 consecutive packed-FP16 input values and scale them by one weight. */
vec4 convolve_stride1(ImageIterator src_iter, float w)
{
    vec4 samples = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);

    return samples * w;
}
139
/** Stride-2 tap: load 8 consecutive packed-FP16 input values, keep every second one
 *  (x and z lanes of each half), and scale the result by one weight. */
vec4 convolve_stride2(ImageIterator src_iter, float w)
{
    vec4 lo = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
    vec4 hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));

    return vec4(lo.xz, hi.xz) * w;
}
153
/** FP16 PROCESS_4X_1Y_1Z variant: each invocation produces 4 output elements along x.
 *
 * Accumulates one CONVOLVE tap per depth plane; gl_GlobalInvocationID.z selects
 * the output feature map and therefore the weight set.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    vec4 pixels = vec4(0.f);

    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

#ifdef WEIGHTS_OPTIMIZATION
    // Process two depth planes per iteration, consuming both halves of a packed
    // weight pair with a single load.
    // NOTE(review): nums = weights_depth / 2 drops the last plane when
    // weights_depth is odd — presumably the host only enables this path for
    // even depths; confirm against the kernel configuration code.
    float w1, w2;
    int   nums = (int(weights_depth)) / 2;
    for(int d = 0; d < nums; ++d)
    {
        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);

        w1 = vec2_w.x;
        vec4 r1 = CONVOLVE(src_iter, w1);
        pixels += r1;

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);

        w2 = vec2_w.y;
        vec4 r2 = CONVOLVE(src_iter, w2);
        pixels += r2;

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
    }
#else /* WEIGHTS_OPTIMIZATION */
    float w;
    for(int d = 0; d < int(weights_depth); ++d)
    {
        // Only the low half of the packed weight pair is used per plane here.
        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;

        vec4 r = CONVOLVE(src_iter, w);
        pixels += r;

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }
#endif /* WEIGHTS_OPTIMIZATION */

#ifdef BIAS
    // Biases are packed two halves per uint; the parity of z_index selects
    // which half of the loaded pair belongs to this feature map.
    vec2  vec2_b;
    float b;

    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));

    if(z_index % uint(2) == uint(0))
    {
        b = vec2_b.x;
    }
    else
    {
        b = vec2_b.y;
    }

    pixels += b;
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels = ACT_OP(pixels);
#endif /* FUSED_ACTIVATION */

    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
#elif defined(PROCESS_4X_2Y_1Z)
/* FP16 path computing a 4x2 (x by y) block of outputs per invocation. */
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

/* Pick the stride-specific convolve helper at compile time. */
#if STRIDE_X == 2
#define CONVOLVE(s, w) convolve_stride2(s, w)
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
#define CONVOLVE(s, w) convolve_stride1(s, w)
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
242
/** Stride-1 tap for two output rows: load 4 packed-FP16 values from each of two
 *  rows (STRIDE_Y apart) and scale both by one weight. */
vec4[2] convolve_stride1(ImageIterator src_iter, float w)
{
    vec4 rows[2];

    rows[0] = w * LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
    rows[1] = w * LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));

    return rows;
}
254
/** Stride-2 tap for two output rows: load 8 packed-FP16 values per row, keep
 *  every second one, and scale both rows by one weight. */
vec4[2] convolve_stride2(ImageIterator src_iter, float w)
{
    vec4 out_rows[2];

    vec4 row0_lo = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
    vec4 row0_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
    out_rows[0]  = w * vec4(row0_lo.xz, row0_hi.xz);

    vec4 row1_lo = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
    vec4 row1_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
    out_rows[1]  = w * vec4(row1_lo.xz, row1_hi.xz);

    return out_rows;
}
274
/** FP16 PROCESS_4X_2Y_1Z variant: each invocation produces a 4x2 (x by y) block
 *  of output elements, accumulated one CONVOLVE tap per depth plane.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    // pixels[0]/pixels[1]: accumulators for the first/second output row.
    vec4 pixels[2];
    pixels[0] = vec4(0.f);
    pixels[1] = vec4(0.f);

    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

#ifdef WEIGHTS_OPTIMIZATION
    // Two depth planes per iteration, one packed weight-pair load.
    // NOTE(review): assumes weights_depth is even — an odd final plane is skipped.
    float w1, w2;
    int   nums = (int(weights_depth)) / 2;
    for(int d = 0; d < nums; ++d)
    {
        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);

        w1 = vec2_w.x;
        vec4 r1[2] = CONVOLVE(src_iter, w1);
        pixels[0] += r1[0];
        pixels[1] += r1[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);

        w2 = vec2_w.y;
        vec4 r2[2] = CONVOLVE(src_iter, w2);
        pixels[0] += r2[0];
        pixels[1] += r2[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
    }
#else /* WEIGHTS_OPTIMIZATION */
    float w;
    for(int d = 0; d < int(weights_depth); ++d)
    {
        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;

        vec4 r[2] = CONVOLVE(src_iter, w);
        pixels[0] += r[0];
        pixels[1] += r[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }
#endif /* WEIGHTS_OPTIMIZATION */

#ifdef BIAS
    // Biases are packed two halves per uint; z_index parity selects the half.
    vec2  vec2_b;
    float b;

    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));

    if(z_index % uint(2) == uint(0))
    {
        b = vec2_b.x;
    }
    else
    {
        b = vec2_b.y;
    }

    pixels[0] += b;
    pixels[1] += b;
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels[0] = ACT_OP(pixels[0]);
    pixels[1] = ACT_OP(pixels[1]);
#endif /* FUSED_ACTIVATION */

    // Row 0 at the current position, row 1 one y-step below.
    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
}
#elif defined(PROCESS_4X_3Y_1Z)
/* FP16 path computing a 4x3 (x by y) block of outputs per invocation. */
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

/* Pick the stride-specific convolve helper at compile time. */
#if STRIDE_X == 2
#define CONVOLVE(s, w) convolve_stride2(s, w)
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
#define CONVOLVE(s, w) convolve_stride1(s, w)
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
371
/** Stride-1 tap for three output rows: load 4 packed-FP16 values from each of
 *  three rows (STRIDE_Y apart) and scale them by one weight. */
vec4[3] convolve_stride1(ImageIterator src_iter, float w)
{
    vec4 rows[3];

    rows[0] = w * LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
    rows[1] = w * LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
    rows[2] = w * LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, (2 * int(STRIDE_Y))));

    return rows;
}
385
/** Stride-2 tap for three output rows: load 8 packed-FP16 values per row, keep
 *  every second one, and scale the three rows by one weight. */
vec4[3] convolve_stride2(ImageIterator src_iter, float w)
{
    vec4 out_rows[3];

    vec4 row0_lo = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
    vec4 row0_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
    out_rows[0]  = w * vec4(row0_lo.xz, row0_hi.xz);

    vec4 row1_lo = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
    vec4 row1_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
    out_rows[1]  = w * vec4(row1_lo.xz, row1_hi.xz);

    vec4 row2_lo = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, (2 * int(STRIDE_Y))));
    vec4 row2_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, (2 * int(STRIDE_Y))));
    out_rows[2]  = w * vec4(row2_lo.xz, row2_hi.xz);

    return out_rows;
}
411
/** FP16 PROCESS_4X_3Y_1Z variant: each invocation produces a 4x3 (x by y) block
 *  of output elements, accumulated one CONVOLVE tap per depth plane.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    // One accumulator per output row.
    vec4 pixels[3];
    pixels[0] = vec4(0.f);
    pixels[1] = vec4(0.f);
    pixels[2] = vec4(0.f);

    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

#ifdef WEIGHTS_OPTIMIZATION
    // Two depth planes per iteration, one packed weight-pair load.
    // NOTE(review): assumes weights_depth is even — an odd final plane is skipped.
    float w1, w2;
    int   nums = (int(weights_depth)) / 2;
    for(int d = 0; d < nums; ++d)
    {
        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);

        w1 = vec2_w.x;
        vec4 r1[3] = CONVOLVE(src_iter, w1);
        pixels[0] += r1[0];
        pixels[1] += r1[1];
        pixels[2] += r1[2];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);

        w2 = vec2_w.y;
        vec4 r2[3] = CONVOLVE(src_iter, w2);
        pixels[0] += r2[0];
        pixels[1] += r2[1];
        pixels[2] += r2[2];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
    }
#else /* WEIGHTS_OPTIMIZATION */
    float w;
    for(int d = 0; d < int(weights_depth); ++d)
    {
        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;

        vec4 r[3] = CONVOLVE(src_iter, w);
        pixels[0] += r[0];
        pixels[1] += r[1];
        pixels[2] += r[2];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }
#endif /* WEIGHTS_OPTIMIZATION */

#ifdef BIAS
    // Biases are packed two halves per uint; z_index parity selects the half.
    vec2  vec2_b;
    float b;

    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));

    if(z_index % uint(2) == uint(0))
    {
        b = vec2_b.x;
    }
    else
    {
        b = vec2_b.y;
    }

    pixels[0] += b;
    pixels[1] += b;
    pixels[2] += b;
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels[0] = ACT_OP(pixels[0]);
    pixels[1] = ACT_OP(pixels[1]);
    pixels[2] = ACT_OP(pixels[2]);
#endif /* FUSED_ACTIVATION */

    // Rows are stored at y-offsets 0, 1 and 2.
    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
}
#elif defined(PROCESS_4X_4Y_1Z)
/* FP16 path computing a 4x4 (x by y) block of outputs per invocation.
 * The convolve helpers take an extra (x1, y1) offset so two row pairs can be
 * computed from the same iterator. */
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

/* Pick the stride-specific convolve helper at compile time. */
#if STRIDE_X == 2
#define CONVOLVE(s, w, x1, y1) convolve_stride2(s, w, x1, y1)
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
#define CONVOLVE(s, w, x1, y1) convolve_stride1(s, w, x1, y1)
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
515
/** Stride-1 tap for two output rows at offset (x1, y1): load 4 packed-FP16
 *  values from each row (STRIDE_Y apart) and scale both by one weight. */
vec4[2] convolve_stride1(ImageIterator src_iter, float w, int x1, int y1)
{
    vec4 rows[2];

    rows[0] = w * LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
    rows[1] = w * LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, (y1 + int(STRIDE_Y))));

    return rows;
}
527
/** Stride-2 tap for two output rows at offset (x1, y1): load 8 packed-FP16
 *  values per row, keep every second one, and scale both rows by one weight. */
vec4[2] convolve_stride2(ImageIterator src_iter, float w, int x1, int y1)
{
    vec4 out_rows[2];

    vec4 row0_lo = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
    vec4 row0_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, (4 + x1), y1));
    out_rows[0]  = w * vec4(row0_lo.xz, row0_hi.xz);

    vec4 row1_lo = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, (y1 + int(STRIDE_Y))));
    vec4 row1_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, (4 + x1), (y1 + int(STRIDE_Y))));
    out_rows[1]  = w * vec4(row1_lo.xz, row1_hi.xz);

    return out_rows;
}
547
/** FP16 PROCESS_4X_4Y_1Z variant: each invocation produces a 4x4 (x by y) block
 *  of output elements. The four rows are handled as two row pairs: pixels[]
 *  holds rows 0-1, pixels1[] holds rows 2-3 (fetched at y offset 2*STRIDE_Y).
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    vec4 pixels[2];
    vec4 pixels1[2];
    pixels[0]  = vec4(0.f);
    pixels[1]  = vec4(0.f);
    pixels1[0] = vec4(0.f);
    pixels1[1] = vec4(0.f);

    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

#ifdef WEIGHTS_OPTIMIZATION
    // Two depth planes per iteration, one packed weight-pair load.
    // NOTE(review): assumes weights_depth is even — an odd final plane is skipped.
    float w1, w2;
    int   nums = (int(weights_depth)) / 2;
    for(int d = 0; d < nums; ++d)
    {
        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);

        w1 = vec2_w.x;
        vec4 r1[2] = CONVOLVE(src_iter, w1, 0, 0);
        vec4 r2[2] = CONVOLVE(src_iter, w1, 0, (2 * int(STRIDE_Y)));
        pixels[0] += r1[0];
        pixels[1] += r1[1];
        pixels1[0] += r2[0];
        pixels1[1] += r2[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);

        w2 = vec2_w.y;
        vec4 r3[2] = CONVOLVE(src_iter, w2, 0, 0);
        vec4 r4[2] = CONVOLVE(src_iter, w2, 0, (2 * int(STRIDE_Y)));
        pixels[0] += r3[0];
        pixels[1] += r3[1];
        pixels1[0] += r4[0];
        pixels1[1] += r4[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
    }
#else /* WEIGHTS_OPTIMIZATION */
    float w;
    for(int d = 0; d < int(weights_depth); ++d)
    {
        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;

        vec4 r1[2] = CONVOLVE(src_iter, w, 0, 0);
        vec4 r2[2] = CONVOLVE(src_iter, w, 0, (2 * int(STRIDE_Y)));
        pixels[0] += r1[0];
        pixels[1] += r1[1];
        pixels1[0] += r2[0];
        pixels1[1] += r2[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }
#endif /* WEIGHTS_OPTIMIZATION */

#ifdef BIAS
    // Biases are packed two halves per uint; z_index parity selects the half.
    vec2  vec2_b;
    float b;

    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));

    if(z_index % uint(2) == uint(0))
    {
        b = vec2_b.x;
    }
    else
    {
        b = vec2_b.y;
    }

    pixels[0] += b;
    pixels[1] += b;
    pixels1[0] += b;
    pixels1[1] += b;
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels[0]  = ACT_OP(pixels[0]);
    pixels[1]  = ACT_OP(pixels[1]);
    pixels1[0] = ACT_OP(pixels1[0]);
    pixels1[1] = ACT_OP(pixels1[1]);
#endif /* FUSED_ACTIVATION */

    // Four consecutive output rows at y-offsets 0..3.
    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels1[0]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 3, 0), pixels1[1]);
}
#elif defined(PROCESS_4X_2Y_2Z)
/* FP16 path computing a 4x2 block of outputs for each of two consecutive
 * output feature maps (z) per invocation. */
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

/* Pick the stride-specific convolve helper at compile time. */
#if STRIDE_X == 2
#define CONVOLVE(s, w) convolve_stride2(s, w)
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
#define CONVOLVE(s, w) convolve_stride1(s, w)
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
662
/** Stride-1 tap for two output rows: load 4 packed-FP16 values from each of two
 *  rows (STRIDE_Y apart) and scale both by one weight. */
vec4[2] convolve_stride1(ImageIterator src_iter, float w)
{
    vec4 rows[2];

    rows[0] = w * LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
    rows[1] = w * LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));

    return rows;
}
674
/** Stride-2 tap for two output rows: load 8 packed-FP16 values per row, keep
 *  every second one, and scale both rows by one weight. */
vec4[2] convolve_stride2(ImageIterator src_iter, float w)
{
    vec4 out_rows[2];

    vec4 row0_lo = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
    vec4 row0_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
    out_rows[0]  = w * vec4(row0_lo.xz, row0_hi.xz);

    vec4 row1_lo = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
    vec4 row1_hi = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
    out_rows[1]  = w * vec4(row1_lo.xz, row1_hi.xz);

    return out_rows;
}
694
/** FP16 PROCESS_4X_2Y_2Z variant: each invocation produces a 4x2 (x by y) block
 *  for two consecutive output feature maps (z_base_index and z_base_index + 1).
 *  The source iterator is rewound between the two z iterations; the weights
 *  iterator simply continues into the next weight set.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    // Each invocation covers two output feature maps: 2*z and 2*z + 1.
    uint z_base_index = uint(gl_GlobalInvocationID.z) << uint(1);

    // store original src current offset so it can be rewound for the second z
    int s_offset_in_bytes = src_iter.current_offset_in_bytes;

    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_base_index * weights_stride_w);

    for(int z = 0; z < 2; ++z)
    {
        uint z_index = z_base_index + uint(z);

        // Rewind the source to the saved position for this feature map.
        src_iter.current_offset_in_bytes = s_offset_in_bytes;

        vec4 pixels[2];
        pixels[0] = vec4(0.f);
        pixels[1] = vec4(0.f);

#ifdef WEIGHTS_OPTIMIZATION
        // Two depth planes per iteration, one packed weight-pair load.
        // NOTE(review): assumes weights_depth is even — an odd final plane is skipped.
        float w1, w2;
        int   nums = (int(weights_depth)) / 2;
        for(int d = 0; d < nums; ++d)
        {
            vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);

            w1 = vec2_w.x;
            vec4 r1[2] = CONVOLVE(src_iter, w1);
            pixels[0] += r1[0];
            pixels[1] += r1[1];

            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);

            w2 = vec2_w.y;
            vec4 r2[2] = CONVOLVE(src_iter, w2);
            pixels[0] += r2[0];
            pixels[1] += r2[1];

            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
        }
#else /* WEIGHTS_OPTIMIZATION */
        float w;
        for(int d = 0; d < int(weights_depth); ++d)
        {
            w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;

            vec4 r[2] = CONVOLVE(src_iter, w);
            pixels[0] += r[0];
            pixels[1] += r[1];

            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
        }
#endif /* WEIGHTS_OPTIMIZATION */

#ifdef BIAS
        // Biases are packed two halves per uint; z_index parity selects the half.
        vec2  vec2_b;
        float b;

        vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));

        if(z_index % uint(2) == uint(0))
        {
            b = vec2_b.x;
        }
        else
        {
            b = vec2_b.y;
        }

        pixels[0] += b;
        pixels[1] += b;
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
        pixels[0] = ACT_OP(pixels[0]);
        pixels[1] = ACT_OP(pixels[1]);
#endif /* FUSED_ACTIVATION */

        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
        STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);

        // Move the output iterator to the next feature map.
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_z);
    }
}
#elif defined(PROCESS_8X_1Y_1Z)
/* FP16 path computing 8 output elements along x per invocation.
 * src/dst are read via LOAD_UNPACK8_*_HALF, i.e. a uvec4 packs 8 half values. */
TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

/* Pick the stride-specific convolve helper at compile time. */
#if STRIDE_X == 2
#define CONVOLVE(s, w) convolve_stride2(s, w)
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
#define CONVOLVE(s, w) convolve_stride1(s, w)
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
804
/** Stride-1 tap: load 8 consecutive packed-FP16 input values (two vec4 halves)
 *  and scale both halves by one weight. */
vec4[2] convolve_stride1(ImageIterator src_iter, float w)
{
    vec4 samples[2];
    samples = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);

    samples[0] = samples[0] * w;
    samples[1] = samples[1] * w;

    return samples;
}
815
/** Stride-2 tap: load 16 consecutive packed-FP16 input values (two 8-wide
 *  loads), keep every second one, and scale the result by one weight. */
vec4[2] convolve_stride2(ImageIterator src_iter, float w)
{
    vec4 lo[2];
    vec4 hi[2];
    vec4 result[2];

    lo = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
    result[0] = w * vec4(lo[0].xz, lo[1].xz);

    hi = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 8, 0));
    result[1] = w * vec4(hi[0].xz, hi[1].xz);

    return result;
}
832
/** FP16 PROCESS_8X_1Y_1Z variant: each invocation produces 8 output elements
 *  along x (two vec4 accumulators), one CONVOLVE tap per depth plane.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    // pixels[0]: output elements 0-3, pixels[1]: output elements 4-7.
    vec4 pixels[2];
    pixels[0] = vec4(0.f);
    pixels[1] = vec4(0.f);

    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

#ifdef WEIGHTS_OPTIMIZATION
    // Two depth planes per iteration, one packed weight-pair load.
    // NOTE(review): assumes weights_depth is even — an odd final plane is skipped.
    float w1, w2;
    int   nums = (int(weights_depth)) / 2;
    for(int d = 0; d < nums; ++d)
    {
        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);

        w1 = vec2_w.x;
        vec4 r1[2] = CONVOLVE(src_iter, w1);
        pixels[0] += r1[0];
        pixels[1] += r1[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);

        w2 = vec2_w.y;
        vec4 r2[2] = CONVOLVE(src_iter, w2);
        pixels[0] += r2[0];
        pixels[1] += r2[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
    }
#else /* WEIGHTS_OPTIMIZATION */
    float w;
    for(int d = 0; d < int(weights_depth); ++d)
    {
        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;

        vec4 r[2] = CONVOLVE(src_iter, w);
        pixels[0] += r[0];
        pixels[1] += r[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }
#endif /* WEIGHTS_OPTIMIZATION */

#ifdef BIAS
    // Biases are packed two halves per uint; z_index parity selects the half.
    vec2  vec2_b;
    float b;

    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));

    if(z_index % uint(2) == uint(0))
    {
        b = vec2_b.x;
    }
    else
    {
        b = vec2_b.y;
    }

    pixels[0] += b;
    pixels[1] += b;
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels[0] = ACT_OP(pixels[0]);
    pixels[1] = ACT_OP(pixels[1]);
#endif /* FUSED_ACTIVATION */

    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
#elif defined(PROCESS_8X_2Y_1Z)
/* Kernel variant producing an 8(x) by 2(y) by 1(z) output tile per invocation.
 * FP16 data is packed two halves per uint, so a uvec4 holds 8 pixels and a
 * uint holds 2 weights/biases (hence the vector sizes / element shifts below). */
TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

/* Select the row-convolution helper matching the compile-time horizontal stride.
 * Only strides 1 and 2 are implemented; anything larger is rejected at build time. */
#if STRIDE_X == 2
#define CONVOLVE(s, w, x1, y1) convolve_stride2(s, w, x1, y1)
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
#define CONVOLVE(s, w, x1, y1) convolve_stride1(s, w, x1, y1)
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
928
/** Load 8 consecutive FP16 source pixels and scale them by one weight (STRIDE_X == 1).
 *
 * Reads the 8-pixel group at offset (x1, y1) from the current source position
 * and multiplies every pixel by the scalar weight w.
 *
 * @return two vec4s holding the 8 weighted pixels, in source order.
 */
vec4[2] convolve_stride1(ImageIterator src_iter, float w, int x1, int y1)
{
    vec4 texels[2];

    texels = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));

    texels[0] = w * texels[0];
    texels[1] = w * texels[1];

    return texels;
}
939
/** Load 8 strided FP16 source pixels and scale them by one weight (STRIDE_X == 2).
 *
 * Reads two consecutive 8-pixel groups starting at (x1, y1) and (x1 + 8, y1),
 * keeps only the even-indexed pixel of each pair (the .x and .z lanes of every
 * vec4), and multiplies the surviving 8 pixels by the scalar weight w.
 *
 * @return two vec4s holding the 8 weighted, stride-2 pixels.
 */
vec4[2] convolve_stride2(ImageIterator src_iter, float w, int x1, int y1)
{
    vec4 lo[2];
    vec4 hi[2];
    vec4 acc[2];

    lo     = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
    acc[0] = w * vec4(lo[0].xz, lo[1].xz);

    hi     = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, (8 + x1), y1));
    acc[1] = w * vec4(hi[0].xz, hi[1].xz);

    return acc;
}
956
/** Direct convolution, PROCESS_8X_2Y_1Z variant: each invocation accumulates
 * two adjacent output rows of 8 pixels each (pixels / pixels1) for one output
 * feature map (z_index), walking the weights along their depth dimension.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    // Accumulators: pixels = first output row, pixels1 = row below it.
    vec4 pixels[2];
    vec4 pixels1[2];
    pixels[0]  = vec4(0.f);
    pixels[1]  = vec4(0.f);
    pixels1[0] = vec4(0.f);
    pixels1[1] = vec4(0.f);

    // z selects the output feature map; jump to that map's weight set.
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

#ifdef WEIGHTS_OPTIMIZATION
    // Optimized path: one packed uint read yields two f16 weights, so each
    // loop iteration consumes two depth slices of src and weights.
    // NOTE(review): assumes weights_depth is even — odd depths would drop the
    // last slice here; confirm the host-side kernel configuration guarantees this.
    float w1, w2;
    int   nums = (int(weights_depth)) / 2;
    for(int d = 0; d < nums; ++d)
    {
        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);

        // First weight of the pair: convolve both output rows.
        w1         = vec2_w.x;
        vec4 r1[2] = CONVOLVE(src_iter, w1, 0, 0);
        vec4 r2[2] = CONVOLVE(src_iter, w1, 0, (int(STRIDE_Y)));
        pixels[0] += r1[0];
        pixels[1] += r1[1];
        pixels1[0] += r2[0];
        pixels1[1] += r2[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);

        // Second weight of the pair, applied to the next src depth slice.
        w2         = vec2_w.y;
        vec4 r3[2] = CONVOLVE(src_iter, w2, 0, 0);
        vec4 r4[2] = CONVOLVE(src_iter, w2, 0, (int(STRIDE_Y)));
        pixels[0] += r3[0];
        pixels[1] += r3[1];
        pixels1[0] += r4[0];
        pixels1[1] += r4[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
    }
#else /* WEIGHTS_OPTIMIZATION */
    // Generic path: one weight (the .x half of the packed pair) per depth slice.
    float w;
    for(int d = 0; d < int(weights_depth); ++d)
    {
        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;

        vec4 r1[2] = CONVOLVE(src_iter, w, 0, 0);
        vec4 r2[2] = CONVOLVE(src_iter, w, 0, (int(STRIDE_Y)));
        pixels[0] += r1[0];
        pixels[1] += r1[1];
        pixels1[0] += r2[0];
        pixels1[1] += r2[1];

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }
#endif /* WEIGHTS_OPTIMIZATION */

#ifdef BIAS
    // Biases are packed two f16 per uint; pick the half matching z_index parity.
    vec2  vec2_b;
    float b;

    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));

    if(z_index % uint(2) == uint(0))
    {
        b = vec2_b.x;
    }
    else
    {
        b = vec2_b.y;
    }

    pixels[0] += b;
    pixels[1] += b;
    pixels1[0] += b;
    pixels1[1] += b;
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    // Apply the fused activation in-place on both output rows.
    pixels[0]  = ACT_OP(pixels[0]);
    pixels[1]  = ACT_OP(pixels[1]);
    pixels1[0] = ACT_OP(pixels1[0]);
    pixels1[1] = ACT_OP(pixels1[1]);
#endif /* FUSED_ACTIVATION */

    // Pack back to f16 and store: current row, then the row one y-step below.
    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
    STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels1);
}
1054#endif /* PROCESS_4X_1Y_1Z */
Joel Liang63875432018-01-02 14:05:06 +08001055#else /* DATA_TYPE_FP32 */
Anthony Barbier7068f992017-10-26 15:23:08 +01001056#error Data type not supported
1057#endif /* DATA_TYPE_FP32 */