blob: aa639b2edaecff649a019c41dae80a50dc908c7e [file] [log] [blame]
Anthony Barbier7068f992017-10-26 15:23:08 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
Anthony Barbier7068f992017-10-26 15:23:08 +010026
Joel Liangc5a7e592017-12-29 14:38:56 +080027#include "helpers_cs.h"
Anthony Barbier7068f992017-10-26 15:23:08 +010028
Joel Liangc5a7e592017-12-29 14:38:56 +080029#if defined(DATA_TYPE_FP16)
30precision mediump float;
31#endif // DATA_TYPE_FP16
Anthony Barbier7068f992017-10-26 15:23:08 +010032
/** Performs a pooling function
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
 * @note The pool size must be passed at compile time using "#define POOLING_LAYER_n". e.g. "#define POOLING_LAYER_2"
 *       n must be one of these: 2, 3, 7, N
 *       Pool size must be passed using POOL_SIZE if POOLING_LAYER_N is defined. e.g. POOL_SIZE=13;
 * @note In case of average pooling the following information must be passed at compile time:
 *       POOL_AVG must be provided otherwise max pooling will be performed.
 *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
 *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
 *       PAD_X and PAD_Y which are the pooling paddings in x and y dimension
 *
 * @param[in]  src_ptr   Pointer to the source image. Supported data types: F32/F16
 * @param[in]  src_attrs The attributes of the source image
 * @param[out] dst_ptr   Pointer to the destination image. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs The attributes of the destination image
 */
// Shader parameter block: one attribute record per tensor used by the kernels below.
SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs; // attributes of the source tensor
    Tensor3DAttributes dst_attrs; // attributes of the destination tensor
};
55
Joel Liangc5a7e592017-12-29 14:38:56 +080056// Common definitions
// Pooling accumulation operators (vec4, float and vec2 flavours).
//  - Average / L2 pooling: res = a + b, i.e. a running sum.
//  - Max pooling: res = a, except that every component where a is NaN or a < b takes b.
// Every argument is parenthesized so that expression arguments keep their meaning.
// The max variants expand to several statements and read their arguments more than once:
// use them only as stand-alone statements and pass side-effect-free arguments.
#if defined(POOL_AVG) || defined(POOL_L2)
#define POOL_OP(res, a, b) ((res) = (a) + (b))
#define POOL_OP_float(res, a, b) ((res) = (a) + (b))
#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
#else /* defined(POOL_AVG) || defined(POOL_L2) */
#define POOL_OP(res, a, b)                  \
    (res) = (a);                            \
    if(isnan((a).x) || ((a).x < (b).x))     \
    {                                       \
        (res).x = (b).x;                    \
    }                                       \
    if(isnan((a).y) || ((a).y < (b).y))     \
    {                                       \
        (res).y = (b).y;                    \
    }                                       \
    if(isnan((a).z) || ((a).z < (b).z))     \
    {                                       \
        (res).z = (b).z;                    \
    }                                       \
    if(isnan((a).w) || ((a).w < (b).w))     \
    {                                       \
        (res).w = (b).w;                    \
    }
#define POOL_OP_float(res, a, b)            \
    (res) = (a);                            \
    if(isnan((a)) || ((a) < (b)))           \
    {                                       \
        (res) = (b);                        \
    }
#define POOL_OP_vec2(res, a, b)             \
    (res) = (a);                            \
    if(isnan((a).x) || ((a).x < (b).x))     \
    {                                       \
        (res).x = (b).x;                    \
    }                                       \
    if(isnan((a).y) || ((a).y < (b).y))     \
    {                                       \
        (res).y = (b).y;                    \
    }
#endif /* defined(POOL_AVG) || defined(POOL_L2) */
97
// Element-wise helpers shared by the pooling variants.
// POW2_OP squares its argument for L2 pooling and is the identity otherwise (vec_size is unused).
#if defined(POOL_L2)
#define POW2_OP(x, vec_size) ((x) * (x))
#else /* defined(POOL_L2) */
#define POW2_OP(x, vec_size) (x)
#endif /* defined(POOL_L2) */

// DIV_OP multiplies x by the reciprocal of y; both arguments are parenthesized so that
// expression arguments (e.g. DIV_OP(a, b + c)) are evaluated as a whole.
#define DIV_OP(x, y) ((x) * (1.f / (y)))
// SQRT_OP forwards to sqrt().
#define SQRT_OP(x) sqrt((x))
106
Joel Liangc5a7e592017-12-29 14:38:56 +0800107#if defined(DATA_TYPE_FP32)
108
// Forward declarations of the FP32 pooling helpers defined further down.
float calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y);
float calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y);

// Source buffer (declared readonly) and destination buffer (declared writeonly) of float elements.
TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

#if defined(POOL_SIZE)
// Starting value of the pooling accumulators: zero when summing (average / L2 pooling),
// a very large negative float when taking the maximum.
#if !(defined(POOL_AVG) || defined(POOL_L2))
#define INITIAL_VALUE -3.402823466385289e+38
#else /* !(defined(POOL_AVG) || defined(POOL_L2)) */
#define INITIAL_VALUE 0.0f
#endif /* !(defined(POOL_AVG) || defined(POOL_L2)) */
#endif // defined(POOL_SIZE)
123
/** Folds one pooling window of the FP32 source with POOL_OP_float.
 *
 * The window origin is derived from the invocation id, the stride and the padding; its extent
 * is @p pool_size clipped to @p upper_bound_w / @p upper_bound_h. Elements are addressed
 * relative to @p src_iter, and the accumulator is seeded with the item @p src_iter points at.
 *
 * @return The accumulated value of the window.
 */
float calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Window origin in source coordinates
    int origin_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
    int origin_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;

    // Number of columns / rows left after clipping against the upper bounds
    int cols = int(min(origin_x + pool_size, upper_bound_w)) - origin_x;
    int rows = int(min(origin_y + pool_size, upper_bound_h)) - origin_y;

    float best = LOAD_CURRENT_ITEM(src_ptr, src_iter);

    for(int dy = 0; dy < rows; ++dy)
    {
        for(int dx = 0; dx < cols; ++dx)
        {
            float candidate = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, dx, dy, 0));
            POOL_OP_float(best, best, candidate);
        }
    }

    return best;
}
145
/** Averages one pooling window of the FP32 source.
 *
 * NaN inputs are counted as zero and every value goes through POW2_OP (square for L2 pooling,
 * identity otherwise) before being summed. The sum is divided by the area of the clipped
 * window; with EXCLUDE_PADDING the window origin is clamped to zero before the area is taken.
 *
 * @return Sum of the window divided by its area.
 */
float calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int origin_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
    int origin_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
    int limit_x  = int(min(origin_x + pool_size, upper_bound_w));
    int limit_y  = int(min(origin_y + pool_size, upper_bound_h));

    int cols = limit_x - origin_x;
    int rows = limit_y - origin_y;

    // Column-major walk, same accumulation order as before
    float sum = 0.0f;
    for(int dx = 0; dx < cols; dx++)
    {
        for(int dy = 0; dy < rows; ++dy)
        {
            float value = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, dx, dy, 0));
            value       = isnan(value) ? 0.0f : value;
            sum         = sum + POW2_OP(value, 1);
        }
    }

#if defined(EXCLUDE_PADDING)
    // Only the divisor shrinks: the leading padding is left out of the area
    origin_x = max(0, origin_x);
    origin_y = max(0, origin_y);
#endif /* defined(EXCLUDE_PADDING) */

    return sum / float((limit_y - origin_y) * (limit_x - origin_x));
}
178
#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7)

// Fixed pool sizes: POOL_SIZE follows the POOLING_LAYER_n switch.
#if defined(POOLING_LAYER_2)
#define POOL_SIZE 2
#elif defined(POOLING_LAYER_3)
#define POOL_SIZE 3
#elif defined(POOLING_LAYER_7)
#define POOL_SIZE 7
#else // POOLING_LAYER_n
#error Please define POOLING_LAYER_N instead.
#endif // POOLING_LAYER_n

/** FP32 entry point for the fixed pool sizes: one output element per invocation. */
void main(void)
{
    // Iterators over the source and destination tensors
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    // Pool the window that belongs to this invocation
#if defined(POOL_AVG) || defined(POOL_L2)
    float pooled = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#else  /*POOL_AVG*/
    float pooled = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#endif /*POOL_AVG*/

#if defined(POOL_L2)
    // L2 pooling: square root of the averaged squares
    pooled = SQRT_OP(pooled);
#endif /* defined(POOL_L2) */

    STORE_CURRENT_ITEM(dst_ptr, dst_iter, pooled);
}
213
214#elif defined(POOLING_LAYER_3_OPTIMIZED)
215
// Pools four horizontally adjacent 3x3 windows with an x-stride of 1 into the vec4 `res`.
// Each of the three rows is read as a vec4 plus a vec2 at the row offset + 4 (six values),
// the rows are combined with POOL_OP first, then the three columns of every window.
// The macro declares locals (data*, values*) in the caller's scope: expand it once per scope.
#define POOLING3x3_STRIDE1(res, input_ptr, input_iter) \
    vec4 data00 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
    vec2 data01 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
    data00 = POW2_OP(data00, 4); \
    data01 = POW2_OP(data01, 2); \
    vec4 data10 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
    vec2 data11 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
    data10 = POW2_OP(data10, 4); \
    data11 = POW2_OP(data11, 2); \
    vec4 data20 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
    vec2 data21 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
    data20 = POW2_OP(data20, 4); \
    data21 = POW2_OP(data21, 2); \
    \
    vec4 values000 = data00.xyzy; \
    vec4 values001 = data00.zwzw; \
    vec4 values010 = vec4(data01.x, data00.w, data01.xy); \
    vec4 values100 = data10.xyzy; \
    vec4 values101 = data10.zwzw; \
    vec4 values11  = vec4(data11.x, data10.w, data11.xy); \
    vec4 values200 = data20.xyzy; \
    vec4 values201 = data20.zwzw; \
    vec4 values21  = vec4(data21.x, data20.w, data21.xy); \
    \
    POOL_OP(values000, values000, values100); \
    POOL_OP(values000, values000, values200); \
    POOL_OP(values001, values001, values101); \
    POOL_OP(values001, values001, values201); \
    POOL_OP(values010, values010, values11); \
    POOL_OP(values010, values010, values21); \
    \
    POOL_OP(res, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
    POOL_OP(res, res, vec4(values000.z, values001.y, values010.xw))
262
// Pools four horizontally adjacent 3x3 windows with an x-stride of 2 into the vec4 `res`.
// Each of the three rows is read as two vec4 (row offset and row offset + 4) plus one scalar at
// row offset + 8 (nine values); rows are combined with POOL_OP first, then the window columns.
// The macro declares locals (data*, values*) in the caller's scope: expand it once per scope.
#define POOLING3x3_STRIDE2(res, input_ptr, input_iter) \
    vec4  data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
    vec4  data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
    float data010 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \
    data000 = POW2_OP(data000, 4); \
    data001 = POW2_OP(data001, 4); \
    data010 = POW2_OP(data010, 1); \
    vec4  data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
    vec4  data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
    float data11  = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \
    data100 = POW2_OP(data100, 4); \
    data101 = POW2_OP(data101, 4); \
    data11  = POW2_OP(data11, 1); \
    vec4  data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
    vec4  data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
    float data21  = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \
    data200 = POW2_OP(data200, 4); \
    data201 = POW2_OP(data201, 4); \
    data21  = POW2_OP(data21, 1); \
    \
    vec4 values000 = data000.xyzz; \
    vec4 values001 = vec4(data000.w, data001.xxy); \
    vec4 values010 = vec4(data001.zzw, data010); \
    vec4 values100 = data100.xyzz; \
    vec4 values101 = vec4(data100.w, data101.xxy); \
    vec4 values11  = vec4(data101.zzw, data11); \
    vec4 values200 = data200.xyzz; \
    vec4 values201 = vec4(data200.w, data201.xxy); \
    vec4 values21  = vec4(data201.zzw, data21); \
    \
    POOL_OP(values000, values000, values100); \
    POOL_OP(values000, values000, values200); \
    POOL_OP(values001, values001, values101); \
    POOL_OP(values001, values001, values201); \
    POOL_OP(values010, values010, values11); \
    POOL_OP(values010, values010, values21); \
    \
    POOL_OP(res, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
    POOL_OP(res, res, vec4(values000.z, values001.y, values010.xw))
309
// Pools four horizontally adjacent 3x3 windows with an x-stride of 3 into the vec4 `res`.
// Each of the three rows is read as three vec4 (row offset, +4 and +8, twelve values); rows are
// combined with POOL_OP first, then the three columns of every window.
// The macro declares locals (data*) in the caller's scope: expand it once per scope.
#define POOLING3x3_STRIDE3(res, input_ptr, input_iter) \
    vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
    vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
    vec4 data010 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \
    data000 = POW2_OP(data000, 4); \
    data001 = POW2_OP(data001, 4); \
    data010 = POW2_OP(data010, 4); \
    vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
    vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
    vec4 data11  = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \
    data100 = POW2_OP(data100, 4); \
    data101 = POW2_OP(data101, 4); \
    data11  = POW2_OP(data11, 4); \
    vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
    vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
    vec4 data21  = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \
    data200 = POW2_OP(data200, 4); \
    data201 = POW2_OP(data201, 4); \
    data21  = POW2_OP(data21, 4); \
    \
    POOL_OP(data000, data000, data100); \
    POOL_OP(data000, data000, data200); \
    POOL_OP(data001, data001, data101); \
    POOL_OP(data001, data001, data201); \
    POOL_OP(data010, data010, data11); \
    POOL_OP(data010, data010, data21); \
    \
    POOL_OP(res, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
    POOL_OP(res, res, vec4(data000.z, data001.y, data010.xw))
Anthony Barbier7068f992017-10-26 15:23:08 +0100338
/** FP32 entry point of the optimized 3x3 pooling: four output elements per invocation.
 *
 * STRIDE_X selects the matching POOLING3x3_STRIDEn macro. Any other stride is rejected at
 * compile time; previously it fell through every branch and an uninitialized `res` was stored.
 */
void main(void)
{
    // Get pixels pointer
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    vec4 res;
    // Perform pooling 3x3 for 4 output elements
#if STRIDE_X == 1
    POOLING3x3_STRIDE1(res, src_ptr, src_iter);
#elif STRIDE_X == 2
    POOLING3x3_STRIDE2(res, src_ptr, src_iter);
#elif STRIDE_X == 3
    POOLING3x3_STRIDE3(res, src_ptr, src_iter);
#else /* STRIDE_X */
#error POOLING_LAYER_3_OPTIMIZED only supports STRIDE_X equal to 1, 2 or 3
#endif /* STRIDE_X */

    // Divide by pool region in case of average pooling
#if defined(POOL_AVG) || defined(POOL_L2)
    // One window origin per output lane: lanes 0..3 map to output columns 4 * gid.x + {0, 1, 2, 3}
    ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
    int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
    ivec4 end_x   = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
    int   end_y   = min((start_y + 3), MAX_HEIGHT);
#if defined(EXCLUDE_PADDING)
    // Leave the leading padding out of the divisor
    start_x = max(ivec4(0), start_x);
    start_y = max(0, start_y);
#endif /* defined(EXCLUDE_PADDING) */
    res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
#endif /*POOL_AVG*/

#if defined(POOL_L2)
    // Take square root of the result in L2 pooling
    res = SQRT_OP(res);
#endif /* defined(POOL_L2) */

    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
375
376#elif defined(POOLING_LAYER_N)
Joel Liangc5a7e592017-12-29 14:38:56 +0800377
/** FP32 entry point for an arbitrary POOL_SIZE: one output element per invocation.
 *
 * Every row of the POOL_SIZE x POOL_SIZE window is consumed eight values at a time into two
 * vec4 accumulators; the remaining columns go through a scalar accumulator. The accumulators
 * are then folded into one value with the POOL_OP family.
 */
void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    vec4  acc_lo   = vec4(INITIAL_VALUE);
    vec4  acc_hi   = vec4(INITIAL_VALUE);
    float acc_tail = float(INITIAL_VALUE);

    int pool = int(POOL_SIZE);
    for(int row = 0; row < pool; row++)
    {
        int col = 0;

        // Full groups of eight columns
        while((col + 8) <= pool)
        {
            vec4 lo = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, col, row, 0));
            vec4 hi = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, col, row, 0) + uint(4));

            // Squares the values for L2 pooling, identity otherwise
            lo = POW2_OP(lo, 4);
            hi = POW2_OP(hi, 4);

            POOL_OP(acc_lo, acc_lo, lo);
            POOL_OP(acc_hi, acc_hi, hi);

            col += 8;
        }

        // Columns that do not fill a group of eight
        while(col < pool)
        {
            float single = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, col, row, 0));
            single       = POW2_OP(single, 1);
            POOL_OP_float(acc_tail, acc_tail, single);
            ++col;
        }
    }

    // Fold 8 -> 4 -> 2 -> 1 lanes, then merge the scalar leftover
    vec4 fold4;
    POOL_OP(fold4, acc_lo, acc_hi);
    vec2 fold2;
    POOL_OP_vec2(fold2, fold4.xy, fold4.zw);
    float res;
    POOL_OP_float(res, fold2.x, fold2.y);
    POOL_OP_float(res, res, acc_tail);

#if defined(POOL_AVG) || defined(POOL_L2)
    {
        // Turn the sum into an average over the clipped window
        int origin_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
        int origin_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
        int limit_x  = int(min(origin_x + POOL_SIZE, MAX_WIDTH));
        int limit_y  = int(min(origin_y + POOL_SIZE, MAX_HEIGHT));
#if defined(EXCLUDE_PADDING)
        // Leave the leading padding out of the divisor
        origin_x = max(0, origin_x);
        origin_y = max(0, origin_y);
#endif /* defined(EXCLUDE_PADDING) */
        float area = float((limit_y - origin_y) * (limit_x - origin_x));
        res        = DIV_OP(res, area);
    }
#endif /* defined(POOL_AVG) || defined(POOL_L2) */

#if defined(POOL_L2)
    // L2 pooling: square root of the averaged squares
    res = SQRT_OP(res);
#endif /* defined(POOL_L2) */

    STORE_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
Joel Liangc5a7e592017-12-29 14:38:56 +0800451#endif // POOLING_LAYER_N
Anthony Barbier7068f992017-10-26 15:23:08 +0100452
453#elif defined(DATA_TYPE_FP16)
454
// Forward declarations of the FP16 pooling helpers defined further down (two results per call).
vec2 calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y);
vec2 calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y);

// Source buffer (declared readonly) and destination buffer (declared writeonly) of uint elements.
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);

#if defined(POOL_SIZE)
// Starting value of the pooling accumulators: zero when summing (average / L2 pooling),
// -65504.0f when taking the maximum.
#if !(defined(POOL_AVG) || defined(POOL_L2))
#define INITIAL_VALUE -65504.0f
#else /* !(defined(POOL_AVG) || defined(POOL_L2)) */
#define INITIAL_VALUE 0.0f
#endif /* !(defined(POOL_AVG) || defined(POOL_L2)) */
#endif // defined(POOL_SIZE)
469
/** Folds two horizontally adjacent pooling windows of the FP16 source with POOL_OP_float.
 *
 * The first window feeds the .x result, the window one stride to its right feeds the .y result.
 * Source values are read as pairs of halves; for odd strides the second window starts on the
 * second half of a pair, which is why two pairs are read per step in that case.
 * The second result is only computed while the first window does not reach @p upper_bound_w;
 * otherwise .y stays at zero.
 *
 * The origin of the first window uses 2 * gl_GlobalInvocationID.x, matching the FP16
 * calculate_avg: each invocation covers two output columns. The previous code used the plain
 * invocation id, so the clipping bounds were wrong for every invocation with a non-zero x id.
 * NOTE(review): inferred from the sibling function in this file; confirm against the host
 * kernel's elements-per-iteration for FP16.
 *
 * @return vec2 holding the results of the first (.x) and second (.y) window.
 */
vec2 calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int start_x1 = (2 * int(gl_GlobalInvocationID.x)) * stride_x - pad_x;
    int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
    int end_x1   = int(min(start_x1 + pool_size, upper_bound_w));
    int end_y1   = int(min(start_y1 + pool_size, upper_bound_h));

    int start_x2 = start_x1 + stride_x;
    int end_x2   = int(min(start_x2 + pool_size, upper_bound_w));

    //Initialize maximum
    vec2 data_max = vec2(0);

    //Load and Set initial maximum1
    vec2 data_init1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
    data_max.x      = data_init1.x;

    //Load and Set initial maximum2
    if(end_x1 < upper_bound_w)
    {
        if((stride_x % 2) == 0)
        {
            vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x, 0, 0));
            data_max.y      = data_init2.x;
        }
        else
        {
            // Odd stride: the second window starts on the second half of the pair
            vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x - 1, 0, 0));
            data_max.y      = data_init2.y;
        }
    }

    for(int i = 0; (start_y1 + i) < end_y1; i++)
    {
        for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
        {
            //Calculate maximum1
            vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
            if((start_x1 + j + 1) < end_x1)
            {
                float data_mr1;
                POOL_OP_float(data_mr1, data1.x, data1.y);
                POOL_OP_float(data_max.x, data_max.x, data_mr1);
            }
            else
            {
                // Last column of the window: only the first half of the pair belongs to it
                POOL_OP_float(data_max.x, data_max.x, data1.x);
            }

            //Calculate maximum2
            if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
            {
                if((stride_x % 2) == 0)
                {
                    vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0));

                    if((start_x2 + j + 1) < end_x2)
                    {
                        float data_mr2;
                        POOL_OP_float(data_mr2, data2.x, data2.y);
                        POOL_OP_float(data_max.y, data_max.y, data_mr2);
                    }
                    else
                    {
                        POOL_OP_float(data_max.y, data_max.y, data2.x);
                    }
                }
                else
                {
                    vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0));
                    vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0));
                    if((start_x2 + j + 1) < end_x2)
                    {
                        float data_mr2;
                        POOL_OP_float(data_mr2, data3.x, data2.y);
                        POOL_OP_float(data_max.y, data_max.y, data_mr2);
                    }
                    else
                    {
                        POOL_OP_float(data_max.y, data_max.y, data2.y);
                    }
                }
            }
        }
    }
    return data_max;
}
558
/** Averages two horizontally adjacent pooling windows of the FP16 source.
 *
 * The first window feeds the .x result, the window one stride to its right feeds the .y result.
 * Source values are read as pairs of halves and go through POW2_OP (square for L2 pooling,
 * identity otherwise) before being summed. Each sum is divided by the area of its clipped
 * window; with EXCLUDE_PADDING the window origins are clamped to zero before the area is taken.
 *
 * @return vec2 holding the averages of the first (.x) and second (.y) window.
 */
vec2 calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Both windows share the same rows; only the x range differs
    int first_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
    int last_y  = int(min(first_y + pool_size, upper_bound_h));

    int first_x1 = (2 * int(gl_GlobalInvocationID.x)) * stride_x - pad_x;
    int last_x1  = int(min(first_x1 + pool_size, upper_bound_w));
    int first_x2 = first_x1 + stride_x;
    int last_x2  = int(min(first_x2 + pool_size, upper_bound_w));

    bool even_stride    = (stride_x % 2) == 0;
    bool second_enabled = last_x1 <= upper_bound_w;

    float sum1 = float(0);
    float sum2 = float(0);

    for(int row = 0; (first_y + row) < last_y; row++)
    {
        for(int col = 0; (first_x1 + col) < last_x1; col = col + 2)
        {
            // First window: add the pair, or only its first half on the last column
            vec2 pair1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, col, row, 0));
            pair1      = POW2_OP(pair1, 2);
            sum1       = sum1 + pair1.x;
            if((first_x1 + col + 1) < last_x1)
            {
                sum1 = sum1 + pair1.y;
            }

            // Second window
            if(second_enabled && (first_x2 + col) < last_x2)
            {
                bool has_next = (first_x2 + col + 1) < last_x2;
                if(even_stride)
                {
                    vec2 pair2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (col + stride_x), row, 0));
                    pair2      = POW2_OP(pair2, 2);
                    sum2       = sum2 + pair2.x;
                    if(has_next)
                    {
                        sum2 = sum2 + pair2.y;
                    }
                }
                else
                {
                    // Odd stride: the window starts on the second half of one pair and
                    // continues with the first half of the next one
                    vec2 pair2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (col + stride_x - 1), row, 0));
                    vec2 pair3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (col + stride_x + 1), row, 0));
                    pair2      = POW2_OP(pair2, 2);
                    pair3      = POW2_OP(pair3, 2);
                    if(has_next)
                    {
                        sum2 = sum2 + pair3.x + pair2.y;
                    }
                    else
                    {
                        sum2 = sum2 + pair2.y;
                    }
                }
            }
        }
    }

#if defined(EXCLUDE_PADDING)
    // Leave the leading padding out of the divisors
    first_x1 = max(0, first_x1);
    first_x2 = max(0, first_x2);
    first_y  = max(0, first_y);
#endif /* defined(EXCLUDE_PADDING) */

    int rows = last_y - first_y;

    vec2 averages;
    averages.x = sum1 / float(rows * (last_x1 - first_x1));
    averages.y = sum2 / float(rows * (last_x2 - first_x2));

    return averages;
}
645
#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7)

// Fixed pool sizes: POOL_SIZE follows the POOLING_LAYER_n switch.
#if defined(POOLING_LAYER_2)
#define POOL_SIZE 2
#elif defined(POOLING_LAYER_3)
#define POOL_SIZE 3
#elif defined(POOLING_LAYER_7)
#define POOL_SIZE 7
#else // POOLING_LAYER_n
#error Please define POOLING_LAYER_N instead.
#endif // POOLING_LAYER_n

/** FP16 entry point for the fixed pool sizes: two packed output elements per invocation. */
void main(void)
{
    // Iterators over the source and destination tensors
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    // Pool the two windows that belong to this invocation
#if defined(POOL_AVG) || defined(POOL_L2)
    vec2 pooled = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#else  /*POOL_AVG*/
    vec2 pooled = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#endif /*POOL_AVG*/

#if defined(POOL_L2)
    // L2 pooling: square root of the averaged squares
    pooled = SQRT_OP(pooled);
#endif /* defined(POOL_L2) */

    STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pooled);
}
680
681#elif defined(POOLING_LAYER_3_OPTIMIZED)
682
// FP16 flavour of the x-stride 1 pooling: four adjacent 3x3 windows are pooled into `res`.
// Each of the three rows is read as four unpacked halves plus two more at the row offset + 2
// (six values); rows are combined with POOL_OP first, then the three columns of every window.
// The macro declares locals (data*, values*) in the caller's scope: expand it once per scope.
#define POOLING3x3_STRIDE1_fp16(res, input_ptr, input_iter) \
    vec4 data00 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
    vec2 data01 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
    data00 = POW2_OP(data00, 4); \
    data01 = POW2_OP(data01, 2); \
    vec4 data10 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
    vec2 data11 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
    data10 = POW2_OP(data10, 4); \
    data11 = POW2_OP(data11, 2); \
    vec4 data20 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
    vec2 data21 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
    data20 = POW2_OP(data20, 4); \
    data21 = POW2_OP(data21, 2); \
    \
    vec4 values000 = data00.xyzy; \
    vec4 values001 = data00.zwzw; \
    vec4 values010 = vec4(data01.x, data00.w, data01.xy); \
    vec4 values100 = data10.xyzy; \
    vec4 values101 = data10.zwzw; \
    vec4 values11  = vec4(data11.x, data10.w, data11.xy); \
    vec4 values200 = data20.xyzy; \
    vec4 values201 = data20.zwzw; \
    vec4 values21  = vec4(data21.x, data20.w, data21.xy); \
    \
    POOL_OP(values000, values000, values100); \
    POOL_OP(values000, values000, values200); \
    POOL_OP(values001, values001, values101); \
    POOL_OP(values001, values001, values201); \
    POOL_OP(values010, values010, values11); \
    POOL_OP(values010, values010, values21); \
    \
    POOL_OP(res, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
    POOL_OP(res, res, vec4(values000.z, values001.y, values010.xw))
729
// FP16 flavour of the x-stride 2 pooling: four adjacent 3x3 windows are pooled into `res`.
// Each of the three rows is read as two groups of four unpacked halves (row offset and
// row offset + 2) plus the first half of the pair at row offset + 4 (nine values); rows are
// combined with POOL_OP first, then the three columns of every window.
// The macro declares locals (data*, datamiddle*, values*) in the caller's scope: expand it
// once per scope.
#define POOLING3x3_STRIDE2_fp16(res, input_ptr, input_iter) \
    vec4  data000     = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
    vec4  data001     = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
    vec2  datamiddle0 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
    float data010     = datamiddle0.x; \
    vec4  data100     = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
    vec4  data101     = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
    vec2  datamiddle1 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
    float data11      = datamiddle1.x; \
    vec4  data200     = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
    vec4  data201     = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
    vec2  datamiddle2 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
    float data21      = datamiddle2.x; \
    \
    data000 = POW2_OP(data000, 4); \
    data001 = POW2_OP(data001, 4); \
    data010 = POW2_OP(data010, 1); \
    data100 = POW2_OP(data100, 4); \
    data101 = POW2_OP(data101, 4); \
    data11  = POW2_OP(data11, 1); \
    data200 = POW2_OP(data200, 4); \
    data201 = POW2_OP(data201, 4); \
    data21  = POW2_OP(data21, 1); \
    \
    vec4 values000 = data000.xyzz; \
    vec4 values001 = vec4(data000.w, data001.xxy); \
    vec4 values010 = vec4(data001.zzw, data010); \
    vec4 values100 = data100.xyzz; \
    vec4 values101 = vec4(data100.w, data101.xxy); \
    vec4 values11  = vec4(data101.zzw, data11); \
    vec4 values200 = data200.xyzz; \
    vec4 values201 = vec4(data200.w, data201.xxy); \
    vec4 values21  = vec4(data201.zzw, data21); \
    \
    POOL_OP(values000, values000, values100); \
    POOL_OP(values000, values000, values200); \
    POOL_OP(values001, values001, values101); \
    POOL_OP(values001, values001, values201); \
    POOL_OP(values010, values010, values11); \
    POOL_OP(values010, values010, values21); \
    \
    POOL_OP(res, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
    POOL_OP(res, res, vec4(values000.z, values001.y, values010.xw))
791
/** Performs a 3x3 pooling with a horizontal stride of 3 on FP16 input and writes 4 results to @p res
 *
 * For each of the TENSOR3D_OFFSET y coordinates 0, 1 and 2, three vec4 are fetched with VLOAD2_UNPACK4_HALF
 * at the base offset, base + 2 and base + 4 (suffixes _a, _b and _c below). Every fetched vector goes through
 * POW2_OP, the three rows are folded together with POOL_OP, and the folded row is then combined horizontally.
 *
 * @note Presumably the three fetches of a row yield 12 consecutive half values e0..e11, in which case lane i
 *       of @p res combines e(3i), e(3i + 1) and e(3i + 2). Confirm against the helper definitions.
 * @note The expansion declares local variables in the caller's scope and has no trailing semicolon, so it can
 *       only be used once per scope and must be followed by ';'.
 *
 * @param[out] res        vec4 that receives the 4 pooled values
 * @param[in]  input_ptr  Source buffer handed to VLOAD2_UNPACK4_HALF
 * @param[in]  input_iter Iterator handed to TENSOR3D_OFFSET
 */
#define POOLING3x3_STRIDE3_fp16(res, input_ptr, input_iter) \
    vec4 row0_a = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
    vec4 row0_b = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
    vec4 row0_c = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
    row0_a = POW2_OP(row0_a, 4); \
    row0_b = POW2_OP(row0_b, 4); \
    row0_c = POW2_OP(row0_c, 4); \
    \
    vec4 row1_a = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
    vec4 row1_b = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
    vec4 row1_c = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
    row1_a = POW2_OP(row1_a, 4); \
    row1_b = POW2_OP(row1_b, 4); \
    row1_c = POW2_OP(row1_c, 4); \
    \
    vec4 row2_a = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
    vec4 row2_b = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
    vec4 row2_c = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
    row2_a = POW2_OP(row2_a, 4); \
    row2_b = POW2_OP(row2_b, 4); \
    row2_c = POW2_OP(row2_c, 4); \
    \
    POOL_OP(row0_a, row0_a, row1_a); \
    POOL_OP(row0_a, row0_a, row2_a); \
    POOL_OP(row0_b, row0_b, row1_b); \
    POOL_OP(row0_b, row0_b, row2_b); \
    POOL_OP(row0_c, row0_c, row1_c); \
    POOL_OP(row0_c, row0_c, row2_c); \
    \
    vec4 tap0 = vec4(row0_a.xw, row0_b.z, row0_c.y); \
    vec4 tap1 = vec4(row0_a.y, row0_b.xw, row0_c.z); \
    vec4 tap2 = vec4(row0_a.z, row0_b.y, row0_c.xw); \
    POOL_OP(res, tap0, tap1); \
    POOL_OP(res, res, tap2)
Anthony Barbier7068f992017-10-26 15:23:08 +0100820
/** Entry point of the 3x3 FP16 pooling kernel: every invocation produces 4 output elements
 *
 * The pooled values come from the POOLING3x3_STRIDEn_fp16 macro selected by STRIDE_X at compile time.
 * When POOL_AVG or POOL_L2 is defined the result is divided by the area of each pooling region, and
 * when POOL_L2 is defined SQRT_OP is applied afterwards. The 4 values are written with
 * VSTORE2_PACK4_CURRENT_ITEM_HALF.
 *
 * @note STRIDE_X must be 1, 2 or 3. Any other value is rejected at compile time, because no pooling macro
 *       would run and @p res would be stored uninitialised.
 * @note STRIDE_Y, PAD_X, PAD_Y, MAX_WIDTH and MAX_HEIGHT are only read when POOL_AVG or POOL_L2 is defined.
 */
void main(void)
{
    // Get pixels pointer
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    vec4 res;
    // Perform pooling 3x3 for 4 output elements
#if STRIDE_X == 1
    POOLING3x3_STRIDE1_fp16(res, src_ptr, src_iter);
#elif STRIDE_X == 2
    POOLING3x3_STRIDE2_fp16(res, src_ptr, src_iter);
#elif STRIDE_X == 3
    POOLING3x3_STRIDE3_fp16(res, src_ptr, src_iter);
#else /* STRIDE_X is not 1, 2 or 3 */
#error STRIDE_X not supported
#endif /* STRIDE_X */

    // Divide by the pool region area in case of average or L2 pooling
#if defined(POOL_AVG) || defined(POOL_L2)
    // One region per output lane: lane i belongs to output column (4 * invocation.x + i)
    ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
    int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
    // The region end is derived from the unclamped start and limited to MAX_WIDTH / MAX_HEIGHT
    ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
    int   end_y = min((start_y + 3), MAX_HEIGHT);
#if defined(EXCLUDE_PADDING)
    // Negative start coordinates are clamped to 0 so that they do not count towards the divisor
    start_x = max(ivec4(0), start_x);
    start_y = max(0, start_y);
#endif /* defined(EXCLUDE_PADDING) */
    res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
#endif /* defined(POOL_AVG) || defined(POOL_L2) */

#if defined(POOL_L2)
    // Take square root of the result in L2 pooling
    res = SQRT_OP(res);
#endif /* defined(POOL_L2) */

    VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
}
857
858#elif defined(POOLING_LAYER_N)
Joel Liangc5a7e592017-12-29 14:38:56 +0800859
/** Entry point of the generic NxN FP16 pooling kernel: every invocation produces 2 output elements
 *
 * Two POOL_SIZE x POOL_SIZE windows are reduced with the POOL_OP family of macros. Window 0 covers the
 * columns [0, POOL_SIZE) and window 1 the columns [STRIDE_X, STRIDE_X + POOL_SIZE), both expressed as
 * x coordinates passed to TENSOR3D_OFFSET on src_iter. Each row is consumed 8 columns at a time through two
 * VLOAD2_UNPACK4_HALF fetches, and the remaining columns 2 at a time through LOAD_UNPACK2_HALF.
 *
 * When POOL_AVG or POOL_L2 is defined each result is divided by the area of its pooling region, and when
 * POOL_L2 is defined the inputs are squared before accumulation and SQRT_OP is applied at the end.
 * The pair of results is written with STORE_PACK2_CURRENT_ITEM_HALF.
 */
void main(void)
{
    // Set up the source and destination iterators
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    // Accumulators. "_lo" / "_hi" take the first / second vec4 of every 8-column step,
    // tail_acc.x / tail_acc.y take the columns handled outside the 8-column loop for window 0 / window 1.
    vec4 win0_lo  = vec4(INITIAL_VALUE);
    vec4 win1_lo  = vec4(INITIAL_VALUE);
    vec4 win0_hi  = vec4(INITIAL_VALUE);
    vec4 win1_hi  = vec4(INITIAL_VALUE);
    vec2 tail_acc = vec2(INITIAL_VALUE);

    // ---- Window 0: columns [0, POOL_SIZE) ----
    for(int row_idx = 0; row_idx < int(POOL_SIZE); ++row_idx)
    {
        int col_idx = 0;

        // 8 columns per step
        while(col_idx <= (int(POOL_SIZE) - 8))
        {
            vec4 chunk_lo = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, col_idx, row_idx, 0));
            vec4 chunk_hi = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, col_idx, row_idx, 0) + uint(2));

#if defined(POOL_L2)
            // L2 pooling accumulates squared values
            chunk_lo = chunk_lo * chunk_lo;
            chunk_hi = chunk_hi * chunk_hi;
#endif /* defined(POOL_L2) */

            POOL_OP(win0_lo, win0_lo, chunk_lo);
            POOL_OP(win0_hi, win0_hi, chunk_hi);

            col_idx += 8;
        }

        // Remaining columns, 2 per step
        for(; col_idx < int(POOL_SIZE); col_idx += 2)
        {
            vec2 pair = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, col_idx, row_idx, 0));
#if defined(POOL_L2)
            // L2 pooling accumulates squared values
            pair = pair * pair;
#endif /* defined(POOL_L2) */
            if((col_idx + 1) < int(POOL_SIZE))
            {
                // Both components are inside the window
                float merged;
                POOL_OP_float(merged, pair.x, pair.y);
                POOL_OP_float(tail_acc.x, tail_acc.x, merged);
            }
            else
            {
                // Only the first component is inside the window
                POOL_OP_float(tail_acc.x, tail_acc.x, pair.x);
            }
        }
    }

    // ---- Window 1: columns [STRIDE_X, STRIDE_X + POOL_SIZE) ----
    for(int row_idx = 0; row_idx < int(POOL_SIZE); ++row_idx)
    {
        int col_idx = STRIDE_X;

        if((STRIDE_X % 2) != 0)
        {
            // Odd stride: the pair is fetched from column STRIDE_X - 1 and only its second component,
            // i.e. column STRIDE_X, is accumulated. The loops below then continue from STRIDE_X + 1.
            // NOTE(review): presumably needed because LOAD_UNPACK2_HALF expects an even column - confirm.
            vec2 head = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (STRIDE_X - 1), row_idx, 0));
#if defined(POOL_L2)
            // L2 pooling accumulates squared values
            head.y = head.y * head.y;
#endif /* defined(POOL_L2) */
            POOL_OP_float(tail_acc.y, tail_acc.y, head.y);

            col_idx = STRIDE_X + 1;
        }

        // 8 columns per step
        while(col_idx <= (int(POOL_SIZE + STRIDE_X) - 8))
        {
            vec4 chunk_lo = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, col_idx, row_idx, 0));
            vec4 chunk_hi = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, col_idx, row_idx, 0) + uint(2));

#if defined(POOL_L2)
            // L2 pooling accumulates squared values
            chunk_lo = chunk_lo * chunk_lo;
            chunk_hi = chunk_hi * chunk_hi;
#endif /* defined(POOL_L2) */

            POOL_OP(win1_lo, win1_lo, chunk_lo);
            POOL_OP(win1_hi, win1_hi, chunk_hi);

            col_idx += 8;
        }

        // Remaining columns, 2 per step
        for(; col_idx < int(POOL_SIZE + STRIDE_X); col_idx += 2)
        {
            vec2 pair = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, col_idx, row_idx, 0));
#if defined(POOL_L2)
            // L2 pooling accumulates squared values
            pair = pair * pair;
#endif /* defined(POOL_L2) */
            if((col_idx + 1) < int(POOL_SIZE + STRIDE_X))
            {
                // Both components are inside the window
                float merged;
                POOL_OP_float(merged, pair.x, pair.y);
                POOL_OP_float(tail_acc.y, tail_acc.y, merged);
            }
            else
            {
                // Only the first component is inside the window
                POOL_OP_float(tail_acc.y, tail_acc.y, pair.x);
            }
        }
    }

    // ---- Collapse the accumulators of each window into one scalar ----
    vec2 result;

    vec4 win0_v4;
    POOL_OP(win0_v4, win0_lo, win0_hi);
    vec2 win0_v2;
    POOL_OP_vec2(win0_v2, win0_v4.xy, win0_v4.zw);
    POOL_OP_float(result.x, win0_v2.x, win0_v2.y);
    POOL_OP_float(result.x, result.x, tail_acc.x);

    vec4 win1_v4;
    POOL_OP(win1_v4, win1_lo, win1_hi);
    vec2 win1_v2;
    POOL_OP_vec2(win1_v2, win1_v4.xy, win1_v4.zw);
    POOL_OP_float(result.y, win1_v2.x, win1_v2.y);
    POOL_OP_float(result.y, result.y, tail_acc.y);

#if defined(POOL_AVG) || defined(POOL_L2)
    {
        // Divide each result by the area of its pooling region.
        // Both windows share the same rows, so a single pair of row bounds is enough.
        int row_begin  = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
        int col_begin0 = (2 * int(gl_GlobalInvocationID.x)) * STRIDE_X - PAD_X;
        int col_begin1 = col_begin0 + STRIDE_X;

        // The region ends are derived from the unclamped starts
        int row_end  = int(min(row_begin + POOL_SIZE, MAX_HEIGHT));
        int col_end0 = int(min(col_begin0 + POOL_SIZE, MAX_WIDTH));
        int col_end1 = int(min(col_begin1 + POOL_SIZE, MAX_WIDTH));
#if defined(EXCLUDE_PADDING)
        // Negative start coordinates are clamped to 0 so that they do not count towards the divisor
        row_begin  = max(0, row_begin);
        col_begin0 = max(0, col_begin0);
        col_begin1 = max(0, col_begin1);
#endif /* defined(EXCLUDE_PADDING) */
        float area0 = float((row_end - row_begin) * (col_end0 - col_begin0));
        float area1 = float((row_end - row_begin) * (col_end1 - col_begin1));
        result.x    = DIV_OP(result.x, area0);
        result.y    = DIV_OP(result.y, area1);
    }
#endif /* defined(POOL_AVG) || defined(POOL_L2) */

#if defined(POOL_L2)
    // L2 pooling: square root of the averaged squares
    result = SQRT_OP(result);
#endif /* defined(POOL_L2) */

    // Write both results
    STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
}
Joel Liangc5a7e592017-12-29 14:38:56 +08001048#endif // POOLING_LAYER_N
1049
1050#else // DATA_TYPE_FP32
1051#error Data type not supported
1052#endif // DATA_TYPE_FP32