blob: e51cc3785af5a821ad3c09cc514dff09dda568eb [file] [log] [blame]
Anthony Barbier7068f992017-10-26 15:23:08 +01001/*
Isabella Gottardi3f217ec2018-02-12 14:59:19 +00002 * Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier7068f992017-10-26 15:23:08 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
25
Joel Liang63875432018-01-02 14:05:06 +080026#include "helpers_cs.h"
Anthony Barbier7068f992017-10-26 15:23:08 +010027
Isabella Gottardi3f217ec2018-02-12 14:59:19 +000028#ifdef FUSED_ACTIVATION
29#include "activation_layer_helpers_cs.h"
30#endif /* FUSED_ACTIVATION */
31
Joel Liang63875432018-01-02 14:05:06 +080032#if defined(DATA_TYPE_FP16)
33precision mediump float;
34#endif // DATA_TYPE_FP16
35
/** This kernel performs a direct convolution to convolve the low three dimensions.
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
 * @note This kernel has multiple optimized direct convolution options for FP32 and FP16.
 *       The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
 * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
 *       This OpenGL ES shader works with stride_x = 1 and 2
 * @note In case biases will be added to the convolution "#define BIAS" has to be passed at compile time;
 *       the bias of the current output plane is then added to every computed value.
 * @note A fused activation is applied to the result when "#define FUSED_ACTIVATION" is passed at compile time.
 *
 * @param[in]  src_ptr          Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_attrs        The attributes of the source tensor
 * @param[out] dst_ptr          Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs        The attributes of the destination tensor
 * @param[in]  weights_ptr      Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in]  weights_attrs    The attributes of the weights tensor
 * @param[in]  biases_ptr       Pointer to the biases tensor. Supported data types: same as @p src_ptr
 * @param[in]  biases_attrs     The attributes of the biases tensor
 * @param[in]  weights_stride_w Stride of the weights tensor in the 4th dimension
 * @param[in]  weights_depth    The third dimension of the weights tensor
 */
// Parameters shared by every variant of the kernel.
// NOTE(review): the member order presumably has to match what the host side uploads - keep it unchanged.
SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;     // attributes of the source tensor
    Tensor3DAttributes dst_attrs;     // attributes of the destination tensor
    Tensor3DAttributes weights_attrs; // attributes of the weights tensor
#if defined(BIAS)
    VectorAttributes biases_attrs;    // attributes of the biases vector, only present with BIAS
#endif // defined(BIAS)
    uint weights_stride_w;            // used as a byte stride between the kernels of two output planes
    uint weights_depth;               // number of input planes accumulated per output value
};
67
Joel Liang63875432018-01-02 14:05:06 +080068#if defined(DATA_TYPE_FP32)
69#if defined(PROCESS_1X_1Y_1Z)
// Buffers of the FP32 1X_1Y_1Z variant: every tensor is addressed through float elements.
TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif // defined(BIAS)
76
/** FP32 variant that computes one output value per invocation.
 *
 * The output plane is taken from gl_GlobalInvocationID.z. For each of the
 * weights_depth input planes three rows of three source values are multiplied
 * with the matching weights and accumulated. The optional bias and the optional
 * fused activation are applied before the value is stored.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#if defined(BIAS)
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif // defined(BIAS)

    // Move the weights iterator to the kernel that belongs to this output plane
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    float acc = 0.f;

    int depth = int(weights_depth);
    for(int plane = 0; plane < depth; ++plane)
    {
        // Row 0 of the window
        vec3 in_row = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
        vec3 k_row  = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
        acc += in_row.x * k_row.x + in_row.y * k_row.y + in_row.z * k_row.z;

        // Row 1 of the window
        in_row = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
        k_row  = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
        acc += in_row.x * k_row.x + in_row.y * k_row.y + in_row.z * k_row.z;

        // Row 2 of the window
        in_row = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
        k_row  = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
        acc += in_row.x * k_row.x + in_row.y * k_row.y + in_row.z * k_row.z;

        // Step both iterators to the next input plane
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#if defined(BIAS)
    acc += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif // defined(BIAS)

#if defined(FUSED_ACTIVATION)
    acc = ACT_OP(acc);
#endif // defined(FUSED_ACTIVATION)

    STORE_CURRENT_ITEM(dst_ptr, dst_iter, acc);
}
Joel Liang63875432018-01-02 14:05:06 +0800127
128#elif defined(PROCESS_8X_1Y_1Z)
129
// Buffers of the FP32 8X_1Y_1Z variant: source and destination are addressed as vec4, weights and biases as float.
TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif // defined(BIAS)

// Select the 1x3 row convolution that matches the compile-time stride
#if STRIDE_X == 1
#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
#elif STRIDE_X == 2
#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
#else /* STRIDE_X is neither 1 nor 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X */
144
/** Convolves one source row with three weights for stride 1, producing eight results.
 *
 * @param[in] offset Offset into src_ptr of the first vec4 to load; three vec4 are loaded with VLOAD3
 * @param[in] w      The three weights of the kernel row
 *
 * @return Two vec4 holding the eight results
 */
vec4[2] convolve1x3_stride1(uint offset, vec3 w)
{
    vec4 loaded[3] = VLOAD3(vec4[3], src_ptr, offset);
    vec4 result[2];

    for(int i = 0; i < 2; ++i)
    {
        // Views of the row shifted by one and by two elements
        vec4 shifted1 = vec4(loaded[i].yzw, loaded[i + 1].x);
        vec4 shifted2 = vec4(loaded[i].zw, loaded[i + 1].xy);

        result[i] = loaded[i] * w[0] + shifted1 * w[1] + shifted2 * w[2];
    }

    return result;
}
166
/** Convolves one source row with three weights for stride 2, producing eight results.
 *
 * Five vec4 are read in total: three with VLOAD3 at @p offset and two more
 * with VLOAD2 at @p offset + 3.
 *
 * @param[in] offset Offset into src_ptr of the first vec4 to load
 * @param[in] w      The three weights of the kernel row
 *
 * @return Two vec4 holding the eight results
 */
vec4[2] convolve1x3_stride2(uint offset, vec3 w)
{
    vec4 head[3]   = VLOAD3(vec4[3], src_ptr, offset);
    vec4 tail[2]   = VLOAD2(vec4[2], src_ptr, offset + uint(3));
    vec4 loaded[5] = vec4[5](head[0], head[1], head[2], tail[0], tail[1]);

    vec4 result[2];
    for(int i = 0; i < 2; ++i)
    {
        // Each group of four results starts two vec4 further into the row
        int first = 2 * i;

        vec4 tap0 = vec4(loaded[first].xz, loaded[first + 1].xz);
        vec4 tap1 = vec4(loaded[first].yw, loaded[first + 1].yw);
        vec4 tap2 = vec4(loaded[first].z, loaded[first + 1].xz, loaded[first + 2].x);

        result[i] = tap0 * w[0] + tap1 * w[1] + tap2 * w[2];
    }

    return result;
}
194
/** FP32 variant that computes eight output values (two vec4) per invocation.
 *
 * For each of the weights_depth input planes the three kernel rows are applied
 * through CONVOLVE1x3 and accumulated. The optional bias and the optional fused
 * activation are applied before both vec4 are stored.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#if defined(BIAS)
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif // defined(BIAS)

    vec4 acc[2];
    acc[0] = vec4(0);
    acc[1] = vec4(0);

    // Move the weights iterator to the kernel that belongs to this output plane
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    int depth = int(weights_depth);
    for(int plane = 0; plane < depth; ++plane)
    {
        vec3 k_row;      // three weights of the current kernel row
        vec4 partial[2]; // contribution of the current row

        // Kernel row 0
        k_row   = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
        partial = CONVOLVE1x3(CURRENT_ITEM_OFFSET(src_iter), k_row);
        acc[0] += partial[0];
        acc[1] += partial[1];

        // Kernel row 1
        k_row   = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
        partial = CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 1), k_row);
        acc[0] += partial[0];
        acc[1] += partial[1];

        // Kernel row 2
        k_row   = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
        partial = CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 2), k_row);
        acc[0] += partial[0];
        acc[1] += partial[1];

        // Step both iterators to the next input plane
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#if defined(BIAS)
    vec4 bias = vec4(LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index)));
    acc[0] += bias;
    acc[1] += bias;
#endif // defined(BIAS)

#if defined(FUSED_ACTIVATION)
    acc[0] = ACT_OP(acc[0]);
    acc[1] = ACT_OP(acc[1]);
#endif // defined(FUSED_ACTIVATION)

    VSTORE2_CURRENT_ITEM(dst_ptr, dst_iter, acc);
}
Joel Liang63875432018-01-02 14:05:06 +0800256
257#elif defined(PROCESS_4X_1Y_1Z)
258
// Buffers of the FP32 4X_1Y_1Z variant: source and destination are addressed as vec4, weights and biases as float.
TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif // defined(BIAS)

// Select the 1x3 row convolution that matches the compile-time stride
#if STRIDE_X == 1
#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
#elif STRIDE_X == 2
#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
#else /* STRIDE_X is neither 1 nor 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X */
273
/** Convolves one source row with three weights for stride 1, producing four results.
 *
 * @param[in] offset Offset into src_ptr of the first vec4 to load; two vec4 are loaded with VLOAD2
 * @param[in] w      The three weights of the kernel row
 *
 * @return The four results
 */
vec4 convolve1x3_stride1(uint offset, vec3 w)
{
    vec4 loaded[2] = VLOAD2(vec4[2], src_ptr, offset);

    // Views of the row shifted by one and by two elements
    vec4 shifted1 = vec4(loaded[0].yzw, loaded[1].x);
    vec4 shifted2 = vec4(loaded[0].zw, loaded[1].xy);

    vec4 result = loaded[0] * w[0] + shifted1 * w[1] + shifted2 * w[2];

    return result;
}
289
/** Convolves one source row with three weights for stride 2, producing four results.
 *
 * @param[in] offset Offset into src_ptr of the first vec4 to load; three vec4 are loaded with VLOAD3
 * @param[in] w      The three weights of the kernel row
 *
 * @return The four results
 */
vec4 convolve1x3_stride2(uint offset, vec3 w)
{
    vec4 loaded[3] = VLOAD3(vec4[3], src_ptr, offset);

    // Every second element, starting at element 0, 1 and 2 respectively
    vec4 tap0 = vec4(loaded[0].xz, loaded[1].xz);
    vec4 tap1 = vec4(loaded[0].yw, loaded[1].yw);
    vec4 tap2 = vec4(loaded[0].z, loaded[1].xz, loaded[2].x);

    vec4 result = tap0 * w[0] + tap1 * w[1] + tap2 * w[2];

    return result;
}
308
/** FP32 variant that computes four output values (one vec4) per invocation.
 *
 * For each of the weights_depth input planes the three kernel rows are applied
 * through CONVOLVE1x3 and accumulated. The optional bias and the optional fused
 * activation are applied before the vec4 is stored.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#if defined(BIAS)
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif // defined(BIAS)

    vec4 acc = vec4(0.f);

    // Move the weights iterator to the kernel that belongs to this output plane
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    int depth = int(weights_depth);
    for(int plane = 0; plane < depth; ++plane)
    {
        vec3 k_row; // three weights of the current kernel row

        // Kernel row 0
        k_row = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
        acc += CONVOLVE1x3(CURRENT_ITEM_OFFSET(src_iter), k_row);

        // Kernel row 1
        k_row = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
        acc += CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 1), k_row);

        // Kernel row 2
        k_row = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
        acc += CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 2), k_row);

        // Step both iterators to the next input plane
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#if defined(BIAS)
    float bias = LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
    acc += bias;
#endif // defined(BIAS)

#if defined(FUSED_ACTIVATION)
    acc = ACT_OP(acc);
#endif // defined(FUSED_ACTIVATION)

    STORE_CURRENT_ITEM(dst_ptr, dst_iter, acc);
}
Joel Liang63875432018-01-02 14:05:06 +0800357
358#elif defined(PROCESS_4X_3Y_1Z)
359
// Buffers of the FP32 4X_3Y_1Z variant: source and destination are addressed as vec4, weights and biases as float.
TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif // defined(BIAS)

// This variant only provides the stride 1 row convolution
#define CONVOLVE1x3(left, middle, right, w) convolve1x3_stride1(left, middle, right, w)
368
/** Weighted sum of three already shifted views of a source row.
 *
 * @param[in] left   Values multiplied by the first weight
 * @param[in] middle Values multiplied by the second weight
 * @param[in] right  Values multiplied by the third weight
 * @param[in] w      The three weights of the kernel row
 *
 * @return left * w[0] + middle * w[1] + right * w[2]
 */
vec4 convolve1x3_stride1(vec4 left, vec4 middle, vec4 right, vec3 w)
{
    return left * w[0] + middle * w[1] + right * w[2];
}
377
/** FP32 variant that computes three output rows of four values each per invocation.
 *
 * Five source rows are read per input plane. Each source row contributes to
 * every output row whose 3x3 window covers it, using the kernel row that
 * corresponds to its position inside that window. The optional bias and the
 * optional fused activation are applied before the three vec4 are stored.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#if defined(BIAS)
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif // defined(BIAS)

    // One accumulator per output row
    vec4 acc[3];
    acc[0] = vec4(0);
    acc[1] = vec4(0);
    acc[2] = vec4(0);

    // Move the weights iterator to the kernel that belongs to this output plane
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    int depth = int(weights_depth);
    for(int plane = 0; plane < depth; ++plane)
    {
        // The three rows of the 3x3 kernel of this plane
        vec3 k_rows[3];
        k_rows[0] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
        k_rows[1] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
        k_rows[2] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));

        vec4 in_row[2]; // two vec4 loaded from the current source row
        vec4 shifted1;  // row shifted by one element
        vec4 shifted2;  // row shifted by two elements

        // First source row: output row 0
        in_row   = VLOAD2_CURRENT_ITEM(vec4[2], src_ptr, src_iter);
        shifted1 = vec4(in_row[0].yzw, in_row[1].x);
        shifted2 = vec4(in_row[0].zw, in_row[1].xy);
        acc[0] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[0]);

        // Second source row: output rows 0 and 1
        in_row   = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
        shifted1 = vec4(in_row[0].yzw, in_row[1].x);
        shifted2 = vec4(in_row[0].zw, in_row[1].xy);
        acc[0] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[1]);
        acc[1] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[0]);

        // Third source row: output rows 0, 1 and 2
        in_row   = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
        shifted1 = vec4(in_row[0].yzw, in_row[1].x);
        shifted2 = vec4(in_row[0].zw, in_row[1].xy);
        acc[0] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[2]);
        acc[1] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[1]);
        acc[2] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[0]);

        // Fourth source row: output rows 1 and 2
        in_row   = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
        shifted1 = vec4(in_row[0].yzw, in_row[1].x);
        shifted2 = vec4(in_row[0].zw, in_row[1].xy);
        acc[1] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[2]);
        acc[2] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[1]);

        // Fifth source row: output row 2
        in_row   = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
        shifted1 = vec4(in_row[0].yzw, in_row[1].x);
        shifted2 = vec4(in_row[0].zw, in_row[1].xy);
        acc[2] += CONVOLVE1x3(in_row[0], shifted1, shifted2, k_rows[2]);

        // Step both iterators to the next input plane
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#if defined(BIAS)
    vec4 bias = vec4(LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index)));

    acc[0] += bias;
    acc[1] += bias;
    acc[2] += bias;
#endif // defined(BIAS)

#if defined(FUSED_ACTIVATION)
    acc[0] = ACT_OP(acc[0]);
    acc[1] = ACT_OP(acc[1]);
    acc[2] = ACT_OP(acc[2]);
#endif // defined(FUSED_ACTIVATION)

    STORE_CURRENT_ITEM(dst_ptr, dst_iter, acc[0]);
    STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), acc[1]);
    STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), acc[2]);
}
Anthony Barbier7068f992017-10-26 15:23:08 +0100464
Joel Liang63875432018-01-02 14:05:06 +0800465#endif // PROCESS_nX_nY
466
467#elif defined(DATA_TYPE_FP16)
468
469#if defined(PROCESS_8X_3Y_1Z)
// Buffers of the FP16 8X_3Y_1Z variant: source and destination are addressed as uvec4, weights and biases as uint.
// The half values are unpacked from these words with unpackHalf2x16 where they are read.
TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif // defined(BIAS)

// This variant only provides the stride 1 row convolution
#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
478
/** Convolves twelve unpacked source values with three weights for stride 1, producing eight results.
 *
 * @param[in] tmp Three vec4 holding the source values; tmp[2].zw is not read
 * @param[in] w   The three weights of the kernel row
 *
 * @return Two vec4 holding the eight results
 */
vec4[2] convolve1x3_stride1(vec4 tmp[3], vec3 w)
{
    vec4 result[2];

    for(int i = 0; i < 2; ++i)
    {
        // Views of the row shifted by one and by two elements
        vec4 shifted1 = vec4(tmp[i].yzw, tmp[i + 1].x);
        vec4 shifted2 = vec4(tmp[i].zw, tmp[i + 1].xy);

        result[i] = tmp[i] * w[0] + shifted1 * w[1] + shifted2 * w[2];
    }

    return result;
}
497
/** Loads two uvec4 from the source buffer and unpacks twelve half values from them.
 *
 * @param[in] offset Offset into src_ptr of the first uvec4 to load
 *
 * @return Three vec4 with the unpacked values; the last two words of the second uvec4 are not used
 */
vec4[3] vload2_src_unpack12_half(uint offset)
{
    uvec4 raw[2] = VLOAD2(uvec4[2], src_ptr, offset);

    // Every 32-bit word yields two values, so two words fill one vec4
    return vec4[3](vec4(unpackHalf2x16(raw[0].x), unpackHalf2x16(raw[0].y)),
                   vec4(unpackHalf2x16(raw[0].z), unpackHalf2x16(raw[0].w)),
                   vec4(unpackHalf2x16(raw[1].x), unpackHalf2x16(raw[1].y)));
}
511
/** FP16 variant that computes three output rows of eight values each per invocation.
 *
 * Five source rows are read per input plane. Each source row contributes to
 * every output row whose 3x3 window covers it, using the kernel row that
 * corresponds to its position inside that window. The optional bias and the
 * optional fused activation are applied before the rows are packed and stored.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    // pixels[row][k]: three output rows, each made of two vec4
    vec4 pixels[3][2];
    int  i, j;
    for(i = 0; i < 3; i++)
    {
        for(j = 0; j < 2; j++)
        {
            pixels[i][j] = vec4(0);
        }
    }

    // Move the weights iterator to the kernel that belongs to this output plane
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    for(int d = 0; d < int(weights_depth); ++d)
    {
        // Each kernel row is read as two 32-bit words; only the first three unpacked values are used
        uvec2 packed_w[3];

        packed_w[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
        packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
        packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));

        vec3 w[3];
        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);

        vec4 s[3]; // unpacked values of the current source row
        vec4 r[2]; // contribution of the current source row to one output row

        // first line: output row 0
        s = vload2_src_unpack12_half(CURRENT_ITEM_OFFSET(src_iter));

        r = CONVOLVE1x3(s, w[0]);
        pixels[0][0] += r[0];
        pixels[0][1] += r[1];

        // second line: output rows 0 and 1
        s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 1));

        r = CONVOLVE1x3(s, w[1]);
        pixels[0][0] += r[0];
        pixels[0][1] += r[1];
        r = CONVOLVE1x3(s, w[0]);
        pixels[1][0] += r[0];
        pixels[1][1] += r[1];

        // third line: output rows 0, 1 and 2
        s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 2));

        r = CONVOLVE1x3(s, w[2]);
        pixels[0][0] += r[0];
        pixels[0][1] += r[1];
        r = CONVOLVE1x3(s, w[1]);
        pixels[1][0] += r[0];
        pixels[1][1] += r[1];
        r = CONVOLVE1x3(s, w[0]);
        pixels[2][0] += r[0];
        pixels[2][1] += r[1];

        // fourth line: output rows 1 and 2
        s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 3));

        r = CONVOLVE1x3(s, w[2]);
        pixels[1][0] += r[0];
        pixels[1][1] += r[1];
        r = CONVOLVE1x3(s, w[1]);
        pixels[2][0] += r[0];
        pixels[2][1] += r[1];

        // fifth line: output row 2
        s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 4));

        r = CONVOLVE1x3(s, w[2]);
        pixels[2][0] += r[0];
        pixels[2][1] += r[1];

        // Step both iterators to the next input plane
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#ifdef BIAS
    // The load yields a pair of values; the parity of z_index selects the one for this plane
    vec2  vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
    float b      = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y;

    for(i = 0; i < 3; i++)
    {
        for(j = 0; j < 2; j++)
        {
            pixels[i][j] += vec4(b);
        }
    }
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    // pixels[i] is a vec4[2]; ACT_OP is applied per vec4, the widest operand
    // type it is given anywhere else in this file.
    for(i = 0; i < 3; i++)
    {
        for(j = 0; j < 2; j++)
        {
            pixels[i][j] = ACT_OP(pixels[i][j]);
        }
    }
#endif /* FUSED_ACTIVATION */

    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
    STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
    STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
}
Anthony Barbier7068f992017-10-26 15:23:08 +0100637
Joel Liang63875432018-01-02 14:05:06 +0800638#elif defined(PROCESS_4X_1Y_1Z)
// Buffers of the FP16 4X_1Y_1Z variant: source and destination are addressed as uvec2, weights and biases as uint.
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif // defined(BIAS)

// Select the row convolution and the matching source load for the compile-time stride
#if STRIDE_X == 1
#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
#define LOAD_AND_UNPACK(offset) VLOAD2_UNPACK8_HALF(src_ptr, offset)
#elif STRIDE_X == 2
#define CONVOLVE1x3(s, w) convolve1x3_stride2(s, w)
#define LOAD_AND_UNPACK(offset) VLOAD3_UNPACK12_HALF(src_ptr, offset)
#else /* STRIDE_X is neither 1 nor 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X */
655
/** Convolves eight unpacked source values with three weights for stride 1, producing four results.
 *
 * @param[in] tmp Two vec4 holding the source values; tmp[1].zw is not read
 * @param[in] w   The three weights of the kernel row
 *
 * @return The four results
 */
vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
{
    // Views of the row shifted by one and by two elements
    vec4 shifted1 = vec4(tmp[0].yzw, tmp[1].x);
    vec4 shifted2 = vec4(tmp[0].zw, tmp[1].xy);

    return tmp[0] * w[0] + shifted1 * w[1] + shifted2 * w[2];
}
669
/** Convolves twelve unpacked source values with three weights for stride 2, producing four results.
 *
 * @param[in] tmp Three vec4 holding the source values; only tmp[2].x is read from the last one
 * @param[in] w   The three weights of the kernel row
 *
 * @return The four results
 */
vec4 convolve1x3_stride2(vec4 tmp[3], vec3 w)
{
    // Every second element, starting at element 0, 1 and 2 respectively
    vec4 tap0 = vec4(tmp[0].xz, tmp[1].xz);
    vec4 tap1 = vec4(tmp[0].yw, tmp[1].yw);
    vec4 tap2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);

    return tap0 * w[0] + tap1 * w[1] + tap2 * w[2];
}
685
/** Accumulates a 3x3 convolution for four horizontally adjacent outputs and stores them as packed halves.
 *
 * The depth loop runs weights_depth times. Each pass convolves three consecutive source lines with the
 * three filter rows, then advances the source and weights iterators by their respective stride_z.
 * Bias addition and the fused activation are compiled in only when BIAS / FUSED_ACTIVATION are defined.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    vec4 pixels = vec4(0);

    // Move the weights iterator to the kernel selected by this invocation's z coordinate
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    for(int d = 0; d < int(weights_depth); ++d)
    {
        // Load the three filter rows once per depth step
        uvec2 w_bits[3];

        w_bits[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
        w_bits[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
        w_bits[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));

        // Each row keeps the first three of the four unpacked half values
        vec3 w[3];
        for(int row = 0; row < 3; ++row)
        {
            w[row] = vec3(unpackHalf2x16(w_bits[row].x), unpackHalf2x16(w_bits[row].y).x);
        }

#if STRIDE_X == 2
        vec4 s[3];
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
        vec4 s[2];
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */

        // first line
        s = LOAD_AND_UNPACK(CURRENT_ITEM_OFFSET(src_iter));
        pixels += CONVOLVE1x3(s, w[0]);

        // second line
        s = LOAD_AND_UNPACK(IMAGE_OFFSET(src_iter, 0, 1));
        pixels += CONVOLVE1x3(s, w[1]);

        // third line
        s = LOAD_AND_UNPACK(IMAGE_OFFSET(src_iter, 0, 2));
        pixels += CONVOLVE1x3(s, w[2]);

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#ifdef BIAS
    // Pick the component of the unpacked pair that matches the parity of z_index
    vec2  vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
    float b      = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y;

    pixels += vec4(b);
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels = ACT_OP(pixels);
#endif /* FUSED_ACTIVATION */

    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
Anthony Barbier7068f992017-10-26 15:23:08 +0100766
#elif defined(PROCESS_4X_3Y_1Z)
// Buffer declarations for this variant: source/destination use uvec2 elements, weights/biases use uint
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

// This variant always uses the stride-1 row convolution
#define CONVOLVE1x3(row, taps) convolve1x3_stride1(row, taps)
776
/** Applies one 3-tap filter row to four horizontally adjacent outputs (horizontal stride 1).
 *
 * @param[in] tmp Eight consecutive input values; tmp[0] holds elements 0-3, tmp[1] holds elements 4-7
 * @param[in] w   The three taps of the filter row
 *
 * @return A vec4 whose component i is in[i] * w[0] + in[i + 1] * w[1] + in[i + 2] * w[2],
 *         where in[] is the flattened content of tmp
 */
vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
{
    // The input window shifted by one and by two elements; only tmp[1].xy is ever read.
    vec4 shift1 = vec4(tmp[0].yzw, tmp[1].x);
    vec4 shift2 = vec4(tmp[0].zw, tmp[1].xy);

    return tmp[0] * w.x + shift1 * w.y + shift2 * w.z;
}
790
/** Accumulates a 3x3 convolution for a 4-wide, 3-high block of outputs and stores it as packed halves.
 *
 * pixels[k] is the accumulator for output row k. Per depth step five consecutive source lines are
 * loaded, and every line feeds each output row whose 3-line window contains it. The source and
 * weights iterators then advance by their respective stride_z. Bias addition and the fused
 * activation are compiled in only when BIAS / FUSED_ACTIVATION are defined.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    vec4 pixels[3];
    for(int row = 0; row < 3; ++row)
    {
        pixels[row] = vec4(0);
    }

    // Move the weights iterator to the kernel selected by this invocation's z coordinate
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    for(int d = 0; d < int(weights_depth); ++d)
    {
        // Load the three filter rows once per depth step
        uvec2 w_bits[3];

        w_bits[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
        w_bits[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
        w_bits[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));

        // Each row keeps the first three of the four unpacked half values
        vec3 w[3];
        for(int row = 0; row < 3; ++row)
        {
            w[row] = vec3(unpackHalf2x16(w_bits[row].x), unpackHalf2x16(w_bits[row].y).x);
        }

        vec4 s[2];

        // first line
        s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
        pixels[0] += CONVOLVE1x3(s, w[0]);

        // second line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
        pixels[0] += CONVOLVE1x3(s, w[1]);
        pixels[1] += CONVOLVE1x3(s, w[0]);

        // third line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
        pixels[0] += CONVOLVE1x3(s, w[2]);
        pixels[1] += CONVOLVE1x3(s, w[1]);
        pixels[2] += CONVOLVE1x3(s, w[0]);

        // fourth line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
        pixels[1] += CONVOLVE1x3(s, w[2]);
        pixels[2] += CONVOLVE1x3(s, w[1]);

        // fifth line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
        pixels[2] += CONVOLVE1x3(s, w[2]);

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#ifdef BIAS
    // Pick the component of the unpacked pair that matches the parity of z_index
    vec2  vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
    float b      = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y;

    for(int row = 0; row < 3; ++row)
    {
        pixels[row] += vec4(b);
    }
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels[0] = ACT_OP(pixels[0]);
    pixels[1] = ACT_OP(pixels[1]);
    pixels[2] = ACT_OP(pixels[2]);
#endif /* FUSED_ACTIVATION */

    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
}
Anthony Barbier7068f992017-10-26 15:23:08 +0100887
#elif defined(PROCESS_4X_4Y_1Z)
// Buffer declarations for this variant: source/destination use uvec2 elements, weights/biases use uint
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

// This variant always uses the stride-1 row convolution
#define CONVOLVE1x3(row, taps) convolve1x3_stride1(row, taps)
897
/** Applies one 3-tap filter row to four horizontally adjacent outputs (horizontal stride 1).
 *
 * @param[in] tmp Eight consecutive input values; tmp[0] holds elements 0-3, tmp[1] holds elements 4-7
 * @param[in] w   The three taps of the filter row
 *
 * @return A vec4 whose component i is in[i] * w[0] + in[i + 1] * w[1] + in[i + 2] * w[2],
 *         where in[] is the flattened content of tmp
 */
vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
{
    // The input window shifted by one and by two elements; only tmp[1].xy is ever read.
    vec4 shift1 = vec4(tmp[0].yzw, tmp[1].x);
    vec4 shift2 = vec4(tmp[0].zw, tmp[1].xy);

    return tmp[0] * w.x + shift1 * w.y + shift2 * w.z;
}
911
/** Accumulates a 3x3 convolution for a 4-wide, 4-high block of outputs and stores it as packed halves.
 *
 * pixels[k] is the accumulator for output row k. Per depth step six consecutive source lines are
 * loaded, and every line feeds each output row whose 3-line window contains it. The source and
 * weights iterators then advance by their respective stride_z. Bias addition and the fused
 * activation are compiled in only when BIAS / FUSED_ACTIVATION are defined.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    vec4 pixels[4];
    for(int row = 0; row < 4; ++row)
    {
        pixels[row] = vec4(0);
    }

    // Move the weights iterator to the kernel selected by this invocation's z coordinate
    uint z_index = gl_GlobalInvocationID.z;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);

    for(int d = 0; d < int(weights_depth); ++d)
    {
        // Load the three filter rows once per depth step
        uvec2 w_bits[3];

        w_bits[0] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
        w_bits[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
        w_bits[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));

        // Each row keeps the first three of the four unpacked half values
        vec3 w[3];
        for(int row = 0; row < 3; ++row)
        {
            w[row] = vec3(unpackHalf2x16(w_bits[row].x), unpackHalf2x16(w_bits[row].y).x);
        }

        vec4 s[2];

        // first line
        s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
        pixels[0] += CONVOLVE1x3(s, w[0]);

        // second line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
        pixels[0] += CONVOLVE1x3(s, w[1]);
        pixels[1] += CONVOLVE1x3(s, w[0]);

        // third line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
        pixels[0] += CONVOLVE1x3(s, w[2]);
        pixels[1] += CONVOLVE1x3(s, w[1]);
        pixels[2] += CONVOLVE1x3(s, w[0]);

        // fourth line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
        pixels[1] += CONVOLVE1x3(s, w[2]);
        pixels[2] += CONVOLVE1x3(s, w[1]);
        pixels[3] += CONVOLVE1x3(s, w[0]);

        // fifth line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
        pixels[2] += CONVOLVE1x3(s, w[2]);
        pixels[3] += CONVOLVE1x3(s, w[1]);

        // sixth line
        s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 5));
        pixels[3] += CONVOLVE1x3(s, w[2]);

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
    }

#ifdef BIAS
    // Pick the component of the unpacked pair that matches the parity of z_index
    vec2  vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
    float b      = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y;

    for(int row = 0; row < 4; ++row)
    {
        pixels[row] += vec4(b);
    }
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
    pixels[0] = ACT_OP(pixels[0]);
    pixels[1] = ACT_OP(pixels[1]);
    pixels[2] = ACT_OP(pixels[2]);
    pixels[3] = ACT_OP(pixels[3]);
#endif /* FUSED_ACTIVATION */

    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 3, 0), pixels[3]);
}
#elif defined(PROCESS_4X_3Y_2Z)
// Buffer declarations for this variant: source/destination use uvec2 elements, weights/biases use uint
TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#if defined(BIAS)
TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */

// This variant always uses the stride-1 row convolution
#define CONVOLVE1x3(row, taps) convolve1x3_stride1(row, taps)
1025
/** Applies one 3-tap filter row to four horizontally adjacent outputs (horizontal stride 1).
 *
 * @param[in] tmp Eight consecutive input values; tmp[0] holds elements 0-3, tmp[1] holds elements 4-7
 * @param[in] w   The three taps of the filter row
 *
 * @return A vec4 whose component i is in[i] * w[0] + in[i + 1] * w[1] + in[i + 2] * w[2],
 *         where in[] is the flattened content of tmp
 */
vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
{
    // The input window shifted by one and by two elements; only tmp[1].xy is ever read.
    vec4 shift1 = vec4(tmp[0].yzw, tmp[1].x);
    vec4 shift2 = vec4(tmp[0].zw, tmp[1].xy);

    return tmp[0] * w.x + shift1 * w.y + shift2 * w.z;
}
1039
/** Accumulates a 3x3 convolution for a 4-wide, 3-high output block in two consecutive output slices.
 *
 * Each invocation handles output slices 2 * gl_GlobalInvocationID.z and the one after it. For every
 * slice the source iterator is rewound to its initial offset and the accumulators are cleared; the
 * depth loop then runs weights_depth times over five consecutive source lines, as in the single-slice
 * variants. Bias addition and the fused activation are compiled in only when BIAS / FUSED_ACTIVATION
 * are defined.
 */
void main()
{
    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

#ifdef BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */

    vec4 pixels[3];

    uint z_base_index = gl_GlobalInvocationID.z << 1;

    // Store the original src offset so the iterator can be rewound for the second slice
    uint s_offset_in_bytes = CURRENT_ITEM_OFFSET_IN_BYTES(src_iter);

    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_base_index * weights_stride_w);

    for(int z = 0; z < 2; ++z)
    {
        uint z_index = z_base_index + uint(z);

        SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(src_iter, s_offset_in_bytes);

        for(int row = 0; row < 3; ++row)
        {
            pixels[row] = vec4(0);
        }

        // weights_iter is deliberately not rewound: it keeps advancing by weights_attrs.stride_z
        // across both slices.
        for(int d = 0; d < int(weights_depth); ++d)
        {
            // Load the three filter rows once per depth step
            uvec2 w_bits[3];

            w_bits[0] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
            w_bits[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
            w_bits[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));

            // Each row keeps the first three of the four unpacked half values
            vec3 w[3];
            for(int row = 0; row < 3; ++row)
            {
                w[row] = vec3(unpackHalf2x16(w_bits[row].x), unpackHalf2x16(w_bits[row].y).x);
            }

            vec4 s[2];

            // first line
            s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
            pixels[0] += CONVOLVE1x3(s, w[0]);

            // second line
            s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
            pixels[0] += CONVOLVE1x3(s, w[1]);
            pixels[1] += CONVOLVE1x3(s, w[0]);

            // third line
            s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
            pixels[0] += CONVOLVE1x3(s, w[2]);
            pixels[1] += CONVOLVE1x3(s, w[1]);
            pixels[2] += CONVOLVE1x3(s, w[0]);

            // fourth line
            s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
            pixels[1] += CONVOLVE1x3(s, w[2]);
            pixels[2] += CONVOLVE1x3(s, w[1]);

            // fifth line
            s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
            pixels[2] += CONVOLVE1x3(s, w[2]);

            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
        }

#ifdef BIAS
        // Pick the component of the unpacked pair that matches the parity of z_index
        vec2  vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
        float b      = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y;

        for(int row = 0; row < 3; ++row)
        {
            pixels[row] += vec4(b);
        }
#endif /* BIAS */

#ifdef FUSED_ACTIVATION
        // Only three accumulators exist in this variant; pixels[3] would be out of bounds.
        pixels[0] = ACT_OP(pixels[0]);
        pixels[1] = ACT_OP(pixels[1]);
        pixels[2] = ACT_OP(pixels[2]);
#endif /* FUSED_ACTIVATION */

        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
        STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
        STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);

        // Step to the next output slice, reading the stride from the attributes like src/weights do
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_z);
    }
}
Joel Liang63875432018-01-02 14:05:06 +08001150
1151#endif /* PROCESS_nX_nY_nZ */
1152
1153#else /* DATA_TYPE_FP32 */
1154#error Data type not supported
1155#endif /* DATA_TYPE_FP32 */