Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2017 ARM Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | |
| 25 | layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 26 | |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 27 | #include "helpers_cs.h" |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 28 | |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 29 | #if defined(DATA_TYPE_FP16) |
| 30 | precision mediump float; |
| 31 | #endif // DATA_TYPE_FP16 |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 32 | |
/** Performs a pooling function
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
 * @note The pool size must be passed at compile time using "#define POOLING_LAYER_n". e.g. "#define POOLING_LAYER_2"
 *       n must be one of these: 2, 3, 3_OPTIMIZED, 7, N
 *       Pool size must be passed using POOL_SIZE if POOLING_LAYER_N is defined. e.g. POOL_SIZE=13;
 * @note The pooling type is selected at compile time: define POOL_AVG for average pooling or POOL_L2 for
 *       L2 pooling. Max pooling is performed when neither is defined.
 *       When EXCLUDE_PADDING is defined, the window start is clamped to 0 before the average/L2 divisor is computed.
 * @note The following information must also be passed at compile time:
 *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
 *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
 *       PAD_X and PAD_Y which are the pooling paddings in x and y dimension
 *
 * @param[in]  src_ptr   Pointer to the source image. Supported data types: F32/F16
 * @param[in]  src_attrs The attributes of the source image
 * @param[out] dst_ptr   Pointer to the destination image. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs The attributes of the destination image
 */
SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;
    Tensor3DAttributes dst_attrs;
};
| 55 | |
// Common definitions
//
// POOL_OP       : combines two vec4 operands a and b into res
// POOL_OP_float : same for scalar floats
// POOL_OP_vec2  : same for vec2 operands
//
// For average / L2 pooling the combination is a plain sum.
// For max pooling res receives, per component, b when a is NaN or a < b, and a otherwise
// (so a NaN in a is replaced by b, and a NaN in b never replaces a).
//
// Every argument is parenthesised and the multi-statement variants are wrapped in
// do { ... } while(false) so that each macro behaves as exactly one statement,
// e.g. when used as the body of an unbraced if. The arguments are evaluated more than
// once, so they must be free of side effects.
#if defined(POOL_AVG) || defined(POOL_L2)
#define POOL_OP(res, a, b) ((res) = (a) + (b))
#define POOL_OP_float(res, a, b) ((res) = (a) + (b))
#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
#else /* defined(POOL_AVG) || defined(POOL_L2) */
#define POOL_OP(res, a, b)                        \
    do                                            \
    {                                             \
        (res) = (a);                              \
        if(isnan((a).x) || ((a).x < (b).x))       \
        {                                         \
            (res).x = (b).x;                      \
        }                                         \
        if(isnan((a).y) || ((a).y < (b).y))       \
        {                                         \
            (res).y = (b).y;                      \
        }                                         \
        if(isnan((a).z) || ((a).z < (b).z))       \
        {                                         \
            (res).z = (b).z;                      \
        }                                         \
        if(isnan((a).w) || ((a).w < (b).w))       \
        {                                         \
            (res).w = (b).w;                      \
        }                                         \
    } while(false)
#define POOL_OP_float(res, a, b)                  \
    do                                            \
    {                                             \
        (res) = (a);                              \
        if(isnan((a)) || ((a) < (b)))             \
        {                                         \
            (res) = (b);                          \
        }                                         \
    } while(false)
#define POOL_OP_vec2(res, a, b)                   \
    do                                            \
    {                                             \
        (res) = (a);                              \
        if(isnan((a).x) || ((a).x < (b).x))       \
        {                                         \
            (res).x = (b).x;                      \
        }                                         \
        if(isnan((a).y) || ((a).y < (b).y))       \
        {                                         \
            (res).y = (b).y;                      \
        }                                         \
    } while(false)
#endif /* defined(POOL_AVG) || defined(POOL_L2) */
| 97 | |
// POW2_OP squares its operand for L2 pooling and is the identity otherwise.
// The vec_size argument is accepted for call-site symmetry but is not used by the expansion.
#if defined(POOL_L2)
#define POW2_OP(x, vec_size) ((x) * (x))
#else /* defined(POOL_L2) */
#define POW2_OP(x, vec_size) (x)
#endif /* defined(POOL_L2) */

// DIV_OP multiplies x by the reciprocal of y. Both arguments are parenthesised so that
// compound expressions such as DIV_OP(a + b, c + d) are divided as a whole.
#define DIV_OP(x, y) ((x) * (1.f / (y)))
// SQRT_OP takes the square root of x (component-wise for vectors).
#define SQRT_OP(x) sqrt((x))
| 106 | |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 107 | #if defined(DATA_TYPE_FP32) |
| 108 | |
// Forward declarations of the FP32 pooling helpers defined further down.
float calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y);
float calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y);

// Source tensor is declared readonly and destination tensor writeonly, both with element type float.
TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

// Starting value of the accumulators used by the POOLING_LAYER_N kernel:
// zero when summing (average / L2), a very large negative float when taking the maximum.
// Only available when POOL_SIZE has been supplied at compile time.
#ifdef POOL_SIZE
#if defined(POOL_AVG) || defined(POOL_L2)
#define INITIAL_VALUE 0.0f
#else /* defined(POOL_AVG) || defined(POOL_L2) */
#define INITIAL_VALUE -3.402823466385289e+38
#endif /* defined(POOL_AVG) || defined(POOL_L2) */
#endif /* POOL_SIZE */
| 123 | |
/** Reduces one pooling window with POOL_OP_float (FP32 path).
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] src_iter      Iterator over the source tensor; window offsets are taken relative to it
 * @param[in] upper_bound_w Exclusive upper limit of the window in x
 * @param[in] upper_bound_h Exclusive upper limit of the window in y
 * @param[in] pad_x         Padding subtracted from the window start in x
 * @param[in] pad_y         Padding subtracted from the window start in y
 * @param[in] stride_x      Window step per invocation in x
 * @param[in] stride_y      Window step per invocation in y
 *
 * @return The reduced value of the window, seeded with the element at the current iterator position.
 */
float calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Window start (invocation id * stride - pad) and its end clipped to the upper bounds
    int win_x0 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
    int win_y0 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
    int win_x1 = min(win_x0 + pool_size, upper_bound_w);
    int win_y1 = min(win_y0 + pool_size, upper_bound_h);

    // Seed the running result with the element the iterator currently points at
    float best = LOAD_CURRENT_ITEM(src_ptr, src_iter);

    for(int dy = 0; (win_y0 + dy) < win_y1; ++dy)
    {
        for(int dx = 0; (win_x0 + dx) < win_x1; ++dx)
        {
            float candidate = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, dx, dy, 0));
            POOL_OP_float(best, best, candidate);
        }
    }

    return best;
}
| 145 | |
/** Sums one pooling window and divides by the window area (FP32 path, average and L2 pooling).
 *
 * NaN inputs are replaced by zero before being accumulated. For L2 pooling each value is
 * squared first (POW2_OP is the identity otherwise).
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] src_iter      Iterator over the source tensor; window offsets are taken relative to it
 * @param[in] upper_bound_w Exclusive upper limit of the window in x
 * @param[in] upper_bound_h Exclusive upper limit of the window in y
 * @param[in] pad_x         Padding subtracted from the window start in x
 * @param[in] pad_y         Padding subtracted from the window start in y
 * @param[in] stride_x      Window step per invocation in x
 * @param[in] stride_y      Window step per invocation in y
 *
 * @return Sum of the window divided by the window area.
 */
float calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int win_x0 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
    int win_y0 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
    int win_x1 = min(win_x0 + pool_size, upper_bound_w);
    int win_y1 = min(win_y0 + pool_size, upper_bound_h);

    float sum = 0.0f;
    // The outer loop walks x and the inner loop walks y; the accumulation order is kept as is
    for(int dx = 0; (win_x0 + dx) < win_x1; dx++)
    {
        for(int dy = 0; (win_y0 + dy) < win_y1; ++dy)
        {
            float value = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, dx, dy, 0));
            // NaN elements contribute nothing to the sum
            value = isnan(value) ? 0.0f : value;
            // Raise to power of 2 for L2 Pooling (identity for average pooling)
            value = POW2_OP(value, 1);
            sum += value;
        }
    }

#if defined(EXCLUDE_PADDING)
    // Clamp the window start to 0 so the divisor ignores the part before the tensor origin
    win_x0 = max(0, win_x0);
    win_y0 = max(0, win_y0);
#endif /* defined(EXCLUDE_PADDING) */

    return sum / float((win_y1 - win_y0) * (win_x1 - win_x0));
}
| 178 | |
| 179 | #if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7) |
| 180 | |
// Fixed pool size implied by the selected POOLING_LAYER_n switch
#if defined(POOLING_LAYER_2)
#define POOL_SIZE 2
#elif defined(POOLING_LAYER_3)
#define POOL_SIZE 3
#elif defined(POOLING_LAYER_7)
#define POOL_SIZE 7
#else // POOLING_LAYER_n
#error Please define POOLING_LAYER_N instead.
#endif // POOLING_LAYER_n

/** Entry point of the FP32 2x2 / 3x3 / 7x7 kernels: reduces one window and stores a single float result. */
void main(void)
{
    // Get pixels pointer
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    // Reduce the window with the helper matching the pooling type
#if defined(POOL_AVG) || defined(POOL_L2)
    float pooled = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#else  /* defined(POOL_AVG) || defined(POOL_L2) */
    float pooled = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#endif /* defined(POOL_AVG) || defined(POOL_L2) */

#if defined(POOL_L2)
    // L2 pooling: square root of the mean of squares
    pooled = SQRT_OP(pooled);
#endif /* defined(POOL_L2) */

    // Store result
    STORE_CURRENT_ITEM(dst_ptr, dst_iter, pooled);
}
| 213 | |
| 214 | #elif defined(POOLING_LAYER_3_OPTIMIZED) |
| 215 | |
// 3x3 pooling, stride 1, four outputs per invocation.
// Loads a vec4 plus a vec2 (at offset + 4) from each of the three rows 0, 1 and 2, applies POW2_OP,
// reduces the three rows with POOL_OP and then combines the columns so that, assuming the loads
// return consecutive elements d0..d5, res = (d0|d1|d2, d1|d2|d3, d2|d3|d4, d3|d4|d5),
// where | stands for POOL_OP.
// The macro declares locals in the caller's scope and must be expanded at most once per scope.
#define POOLING3x3_STRIDE1(res, input_ptr, input_iter)                                          \
    vec4 top_lo = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0));                \
    vec2 top_hi = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4));      \
    vec4 mid_lo = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0));                \
    vec2 mid_hi = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4));      \
    vec4 bot_lo = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0));                \
    vec2 bot_hi = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4));      \
    top_lo      = POW2_OP(top_lo, 4);                                                           \
    top_hi      = POW2_OP(top_hi, 2);                                                           \
    mid_lo      = POW2_OP(mid_lo, 4);                                                           \
    mid_hi      = POW2_OP(mid_hi, 2);                                                           \
    bot_lo      = POW2_OP(bot_lo, 4);                                                           \
    bot_hi      = POW2_OP(bot_hi, 2);                                                           \
                                                                                                \
    vec4 col_a = top_lo.xyzy;                                                                   \
    vec4 col_b = top_lo.zwzw;                                                                   \
    vec4 col_c = vec4(top_hi.x, top_lo.w, top_hi.xy);                                           \
    POOL_OP(col_a, col_a, mid_lo.xyzy);                                                         \
    POOL_OP(col_b, col_b, mid_lo.zwzw);                                                         \
    POOL_OP(col_c, col_c, vec4(mid_hi.x, mid_lo.w, mid_hi.xy));                                 \
    POOL_OP(col_a, col_a, bot_lo.xyzy);                                                         \
    POOL_OP(col_b, col_b, bot_lo.zwzw);                                                         \
    POOL_OP(col_c, col_c, vec4(bot_hi.x, bot_lo.w, bot_hi.xy));                                 \
    POOL_OP(res, vec4(col_a.xw, col_b.z, col_c.y), vec4(col_a.y, col_b.xw, col_c.z));           \
    POOL_OP(res, res, vec4(col_a.z, col_b.y, col_c.xw))
| 262 | |
// 3x3 pooling, stride 2, four outputs per invocation.
// Loads two vec4 (offsets +0 and +4) and one float (offset +8) from each of the rows 0, 1 and 2,
// applies POW2_OP, reduces the three rows with POOL_OP and then combines the columns so that,
// assuming the loads return consecutive elements d0..d8,
// res = (d0|d1|d2, d2|d3|d4, d4|d5|d6, d6|d7|d8), where | stands for POOL_OP.
// The macro declares locals in the caller's scope and must be expanded at most once per scope.
#define POOLING3x3_STRIDE2(res, input_ptr, input_iter)                                          \
    vec4  top_a = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0));                \
    vec4  top_b = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4));      \
    float top_c = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8));              \
    vec4  mid_a = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0));                \
    vec4  mid_b = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4));      \
    float mid_c = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8));              \
    vec4  bot_a = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0));                \
    vec4  bot_b = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4));      \
    float bot_c = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8));              \
    top_a       = POW2_OP(top_a, 4);                                                            \
    top_b       = POW2_OP(top_b, 4);                                                            \
    top_c       = POW2_OP(top_c, 1);                                                            \
    mid_a       = POW2_OP(mid_a, 4);                                                            \
    mid_b       = POW2_OP(mid_b, 4);                                                            \
    mid_c       = POW2_OP(mid_c, 1);                                                            \
    bot_a       = POW2_OP(bot_a, 4);                                                            \
    bot_b       = POW2_OP(bot_b, 4);                                                            \
    bot_c       = POW2_OP(bot_c, 1);                                                            \
                                                                                                \
    vec4 col_a = top_a.xyzz;                                                                    \
    vec4 col_b = vec4(top_a.w, top_b.xxy);                                                      \
    vec4 col_c = vec4(top_b.zzw, top_c);                                                        \
    POOL_OP(col_a, col_a, mid_a.xyzz);                                                          \
    POOL_OP(col_b, col_b, vec4(mid_a.w, mid_b.xxy));                                            \
    POOL_OP(col_c, col_c, vec4(mid_b.zzw, mid_c));                                              \
    POOL_OP(col_a, col_a, bot_a.xyzz);                                                          \
    POOL_OP(col_b, col_b, vec4(bot_a.w, bot_b.xxy));                                            \
    POOL_OP(col_c, col_c, vec4(bot_b.zzw, bot_c));                                              \
    POOL_OP(res, vec4(col_a.xw, col_b.z, col_c.y), vec4(col_a.y, col_b.xw, col_c.z));           \
    POOL_OP(res, res, vec4(col_a.z, col_b.y, col_c.xw))
| 309 | |
// 3x3 pooling, stride 3, four outputs per invocation.
// Loads three vec4 (offsets +0, +4 and +8) from each of the rows 0, 1 and 2, applies POW2_OP,
// reduces the three rows into the row-0 registers with POOL_OP and then combines the columns so
// that, assuming the loads return consecutive elements d0..d11,
// res = (d0|d1|d2, d3|d4|d5, d6|d7|d8, d9|d10|d11), where | stands for POOL_OP.
// The macro declares locals in the caller's scope and must be expanded at most once per scope.
#define POOLING3x3_STRIDE3(res, input_ptr, input_iter)                                          \
    vec4 top_a = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0));                 \
    vec4 top_b = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4));       \
    vec4 top_c = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8));       \
    vec4 mid_a = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0));                 \
    vec4 mid_b = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4));       \
    vec4 mid_c = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8));       \
    vec4 bot_a = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0));                 \
    vec4 bot_b = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4));       \
    vec4 bot_c = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8));       \
    top_a      = POW2_OP(top_a, 4);                                                             \
    top_b      = POW2_OP(top_b, 4);                                                             \
    top_c      = POW2_OP(top_c, 4);                                                             \
    mid_a      = POW2_OP(mid_a, 4);                                                             \
    mid_b      = POW2_OP(mid_b, 4);                                                             \
    mid_c      = POW2_OP(mid_c, 4);                                                             \
    bot_a      = POW2_OP(bot_a, 4);                                                             \
    bot_b      = POW2_OP(bot_b, 4);                                                             \
    bot_c      = POW2_OP(bot_c, 4);                                                             \
                                                                                                \
    POOL_OP(top_a, top_a, mid_a);                                                               \
    POOL_OP(top_b, top_b, mid_b);                                                               \
    POOL_OP(top_c, top_c, mid_c);                                                               \
    POOL_OP(top_a, top_a, bot_a);                                                               \
    POOL_OP(top_b, top_b, bot_b);                                                               \
    POOL_OP(top_c, top_c, bot_c);                                                               \
    POOL_OP(res, vec4(top_a.xw, top_b.z, top_c.y), vec4(top_a.y, top_b.xw, top_c.z));           \
    POOL_OP(res, res, vec4(top_a.z, top_b.y, top_c.xw))
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 338 | |
/** Entry point of the optimized FP32 3x3 kernel: computes four output elements per invocation.
 *
 * STRIDE_X selects the POOLING3x3_STRIDEn macro used for the reduction. Only strides 1, 2 and 3
 * have an implementation; any other value is rejected at compile time instead of storing an
 * uninitialised result.
 */
void main(void)
{
    // Get pixels pointer
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    vec4 res;
    // Perform pooling 3x3 for 4 output elements
#if STRIDE_X == 1
    POOLING3x3_STRIDE1(res, src_ptr, src_iter);
#elif STRIDE_X == 2
    POOLING3x3_STRIDE2(res, src_ptr, src_iter);
#elif STRIDE_X == 3
    POOLING3x3_STRIDE3(res, src_ptr, src_iter);
#else /* STRIDE_X */
#error POOLING_LAYER_3_OPTIMIZED only supports STRIDE_X equal to 1, 2 or 3.
#endif /* STRIDE_X */

    // Divide by pool region in case of average pooling
#if defined(POOL_AVG) || defined(POOL_L2)
    // Each invocation covers outputs 4 * id .. 4 * id + 3 along x, hence one window start per lane
    ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
    int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
    ivec4 end_x   = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
    int   end_y   = min((start_y + 3), MAX_HEIGHT);
#if defined(EXCLUDE_PADDING)
    // Clamp the window start to 0 so the divisor ignores the part before the tensor origin
    start_x = max(ivec4(0), start_x);
    start_y = max(0, start_y);
#endif /* defined(EXCLUDE_PADDING) */
    res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
#endif /* defined(POOL_AVG) || defined(POOL_L2) */

#if defined(POOL_L2)
    // Take square root of the result in L2 pooling
    res = SQRT_OP(res);
#endif /* defined(POOL_L2) */

    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
| 375 | |
| 376 | #elif defined(POOLING_LAYER_N) |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 377 | |
/** Entry point of the FP32 POOL_SIZE x POOL_SIZE kernel: reduces one window and stores a single float.
 *
 * Each row is consumed eight elements at a time into two vec4 accumulators; the remaining
 * columns of the row go through a scalar accumulator. The three accumulators are merged with
 * POOL_OP / POOL_OP_vec2 / POOL_OP_float at the end.
 */
void main(void)
{
    // Get pixels pointer
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    vec4  acc_lo   = vec4(INITIAL_VALUE);
    vec4  acc_hi   = vec4(INITIAL_VALUE);
    float acc_tail = float(INITIAL_VALUE);

    for(int row = 0; row < int(POOL_SIZE); row++)
    {
        int col = 0;

        // Vector part: eight columns per iteration
        for(; col <= (int(POOL_SIZE) - 8); col += 8)
        {
            vec4 chunk_lo = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, col, row, 0));
            vec4 chunk_hi = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, col, row, 0) + uint(4));

            // Raise to power of 2 for L2 Pooling (identity otherwise)
            chunk_lo = POW2_OP(chunk_lo, 4);
            chunk_hi = POW2_OP(chunk_hi, 4);

            POOL_OP(acc_lo, acc_lo, chunk_lo);
            POOL_OP(acc_hi, acc_hi, chunk_hi);
        }

        // Leftover columns, one at a time
        for(; col < int(POOL_SIZE); ++col)
        {
            float item = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, col, row, 0));
            // Raise to power of 2 for L2 Pooling (identity otherwise)
            item = POW2_OP(item, 1);
            POOL_OP_float(acc_tail, acc_tail, item);
        }
    }

    // Reduce result: vec4 pair -> vec4 -> vec2 -> float, then fold in the scalar accumulator
    vec4 reduce4;
    POOL_OP(reduce4, acc_lo, acc_hi);
    vec2 reduce2;
    POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw);
    float res;
    POOL_OP_float(res, reduce2.x, reduce2.y);
    POOL_OP_float(res, res, acc_tail);

#if defined(POOL_AVG) || defined(POOL_L2)
    {
        // Divide by pool region in case of average pooling
        int start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
        int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
        int end_x   = int(min(start_x + POOL_SIZE, MAX_WIDTH));
        int end_y   = int(min(start_y + POOL_SIZE, MAX_HEIGHT));
#if defined(EXCLUDE_PADDING)
        // Clamp the window start to 0 so the divisor ignores the part before the tensor origin
        start_x = max(0, start_x);
        start_y = max(0, start_y);
#endif /* defined(EXCLUDE_PADDING) */
        float area = float((end_y - start_y) * (end_x - start_x));
        res        = DIV_OP(res, area);
    }
#endif /* defined(POOL_AVG) || defined(POOL_L2) */

#if defined(POOL_L2)
    // Take square root of the result in L2 pooling
    res = SQRT_OP(res);
#endif /* defined(POOL_L2) */

    // Store result
    STORE_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 451 | #endif // POOLING_LAYER_N |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 452 | |
| 453 | #elif defined(DATA_TYPE_FP16) |
| 454 | |
// Forward declarations of the FP16 pooling helpers defined further down; each returns two results.
vec2 calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y);
vec2 calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y);

// Source tensor is declared readonly and destination tensor writeonly, both with element type uint
// (the helpers below unpack two half values from each loaded item).
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);

// Starting value of the pooling accumulators: zero when summing (average / L2),
// -65504.0f when taking the maximum. Only available when POOL_SIZE has been supplied.
#ifdef POOL_SIZE
#if defined(POOL_AVG) || defined(POOL_L2)
#define INITIAL_VALUE 0.0f
#else /* defined(POOL_AVG) || defined(POOL_L2) */
#define INITIAL_VALUE -65504.0f
#endif /* defined(POOL_AVG) || defined(POOL_L2) */
#endif /* POOL_SIZE */
| 469 | |
/** Reduces two horizontally adjacent pooling windows with POOL_OP_float (FP16 path).
 *
 * The first window starts at the current iterator position, the second one stride_x elements
 * further along x. Values are read as pairs with LOAD_UNPACK2_HALF; for an odd stride the second
 * window starts in the .y half of a pair, so its elements are gathered from two neighbouring pairs.
 * The second result is only computed while the first window ends before upper_bound_w; otherwise
 * it keeps its initial value of 0.
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] src_iter      Iterator over the source tensor; window offsets are taken relative to it
 * @param[in] upper_bound_w Exclusive upper limit of the windows in x
 * @param[in] upper_bound_h Exclusive upper limit of the windows in y
 * @param[in] pad_x         Padding subtracted from the window start in x
 * @param[in] pad_y         Padding subtracted from the window start in y
 * @param[in] stride_x      Window step in x
 * @param[in] stride_y      Window step in y
 *
 * @return vec2 holding the result of the first window in .x and of the second window in .y.
 */
vec2 calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // First window.
    // NOTE(review): calculate_avg() for FP16 scales gl_GlobalInvocationID.x by 2 when computing this
    // start, this function does not - confirm which one is intended before relying on the clipping.
    int ax0 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
    int ay0 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
    int ax1 = min(ax0 + pool_size, upper_bound_w);
    int ay1 = min(ay0 + pool_size, upper_bound_h);

    // Second window: one stride further along x, same rows
    int bx0 = ax0 + stride_x;
    int bx1 = min(bx0 + pool_size, upper_bound_w);

    bool even_stride   = (stride_x % 2) == 0;
    bool second_active = ax1 < upper_bound_w;

    //Initialize maximum
    vec2 data_max = vec2(0.0f);

    //Load and Set initial maximum1
    vec2 seed_a = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
    data_max.x  = seed_a.x;

    //Load and Set initial maximum2
    if(second_active)
    {
        if(even_stride)
        {
            vec2 seed_b = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x, 0, 0));
            data_max.y  = seed_b.x;
        }
        else
        {
            vec2 seed_b = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x - 1, 0, 0));
            data_max.y  = seed_b.y;
        }
    }

    for(int dy = 0; (ay0 + dy) < ay1; dy++)
    {
        for(int dx = 0; (ax0 + dx) < ax1; dx = dx + 2)
        {
            //Calculate maximum1
            vec2 pair_a = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, dx, dy, 0));
            if((ax0 + dx + 1) < ax1)
            {
                // Both halves of the pair lie inside the first window
                float pair_max_a;
                POOL_OP_float(pair_max_a, pair_a.x, pair_a.y);
                POOL_OP_float(data_max.x, data_max.x, pair_max_a);
            }
            else
            {
                // Only the first half is inside the window
                POOL_OP_float(data_max.x, data_max.x, pair_a.x);
            }

            //Calculate maximum2
            if((bx0 + dx) < bx1 && second_active)
            {
                if(even_stride)
                {
                    vec2 pair_b = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (dx + stride_x), dy, 0));

                    if((bx0 + dx + 1) < bx1)
                    {
                        float pair_max_b;
                        POOL_OP_float(pair_max_b, pair_b.x, pair_b.y);
                        POOL_OP_float(data_max.y, data_max.y, pair_max_b);
                    }
                    else
                    {
                        POOL_OP_float(data_max.y, data_max.y, pair_b.x);
                    }
                }
                else
                {
                    // Odd stride: the wanted elements are the .y of one pair and the .x of the next
                    vec2 pair_prev = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (dx + stride_x - 1), dy, 0));
                    vec2 pair_next = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (dx + stride_x + 1), dy, 0));
                    if((bx0 + dx + 1) < bx1)
                    {
                        float pair_max_b;
                        POOL_OP_float(pair_max_b, pair_next.x, pair_prev.y);
                        POOL_OP_float(data_max.y, data_max.y, pair_max_b);
                    }
                    else
                    {
                        POOL_OP_float(data_max.y, data_max.y, pair_prev.y);
                    }
                }
            }
        }
    }
    return data_max;
}
| 558 | |
/** Sums two horizontally adjacent pooling windows and divides each by its area (FP16 path).
 *
 * The first window belongs to output 2 * gl_GlobalInvocationID.x, the second one starts stride_x
 * elements further along x. Values are read as pairs with LOAD_UNPACK2_HALF and squared first for
 * L2 pooling; for an odd stride the elements of the second window are gathered from two
 * neighbouring pairs.
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] src_iter      Iterator over the source tensor; window offsets are taken relative to it
 * @param[in] upper_bound_w Exclusive upper limit of the windows in x
 * @param[in] upper_bound_h Exclusive upper limit of the windows in y
 * @param[in] pad_x         Padding subtracted from the window start in x
 * @param[in] pad_y         Padding subtracted from the window start in y
 * @param[in] stride_x      Window step in x
 * @param[in] stride_y      Window step in y
 *
 * @return vec2 holding the average of the first window in .x and of the second window in .y.
 */
vec2 calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // First window
    int ax0 = (2 * int(gl_GlobalInvocationID.x)) * stride_x - pad_x;
    int ay0 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
    int ax1 = min(ax0 + pool_size, upper_bound_w);
    int ay1 = min(ay0 + pool_size, upper_bound_h);

    // Second window: one stride further along x, same rows
    int bx0 = ax0 + stride_x;
    int by0 = ay0;
    int bx1 = min(bx0 + pool_size, upper_bound_w);
    int by1 = min(by0 + pool_size, upper_bound_h);

    bool even_stride = (stride_x % 2) == 0;
    // ax1 is the result of min(..., upper_bound_w), so this guard always holds
    bool second_active = ax1 <= upper_bound_w;

    //Initialize sum
    float sum_a = 0.0f;
    float sum_b = 0.0f;
    for(int dy = 0; (ay0 + dy) < ay1; dy++)
    {
        for(int dx = 0; (ax0 + dx) < ax1; dx = dx + 2)
        {
            vec2 pair_a = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, dx, dy, 0));
            // Raise to power of 2 for L2 Pooling (identity otherwise)
            pair_a = POW2_OP(pair_a, 2);

            //Calculate sum1
            if((ax0 + dx + 1) < ax1)
            {
                sum_a = sum_a + pair_a.x + pair_a.y;
            }
            else
            {
                sum_a = sum_a + pair_a.x;
            }

            //Calculate sum2
            if((bx0 + dx) < bx1 && second_active)
            {
                if(even_stride)
                {
                    vec2 pair_b = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (dx + stride_x), dy, 0));
                    // Raise to power of 2 for L2 Pooling (identity otherwise)
                    pair_b = POW2_OP(pair_b, 2);
                    if((bx0 + dx + 1) < bx1)
                    {
                        sum_b = sum_b + pair_b.x + pair_b.y;
                    }
                    else
                    {
                        sum_b = sum_b + pair_b.x;
                    }
                }
                else
                {
                    // Odd stride: the wanted elements are the .y of one pair and the .x of the next
                    vec2 pair_prev = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (dx + stride_x - 1), dy, 0));
                    vec2 pair_next = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (dx + stride_x + 1), dy, 0));
                    // Raise to power of 2 for L2 Pooling (identity otherwise)
                    pair_prev = POW2_OP(pair_prev, 2);
                    pair_next = POW2_OP(pair_next, 2);
                    if((bx0 + dx + 1) < bx1)
                    {
                        sum_b = sum_b + pair_next.x + pair_prev.y;
                    }
                    else
                    {
                        sum_b = sum_b + pair_prev.y;
                    }
                }
            }
        }
    }
#if defined(EXCLUDE_PADDING)
    // Clamp the window starts to 0 so the divisors ignore the part before the tensor origin
    ax0 = max(0, ax0);
    ay0 = max(0, ay0);
    bx0 = max(0, bx0);
    by0 = max(0, by0);
#endif /* defined(EXCLUDE_PADDING) */

    //Calculate average
    return vec2(sum_a / float((ay1 - ay0) * (ax1 - ax0)),
                sum_b / float((by1 - by0) * (bx1 - bx0)));
}
| 645 | |
| 646 | #if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7) |
| 647 | |
// Select the compile-time POOL_SIZE (2, 3 or 7) from whichever POOLING_LAYER_<n> build option is defined.
// main() below passes POOL_SIZE as the first argument of calculate_avg()/calculate_max().
// The #else/#error arm is defensive only: the enclosing #if already requires one of the three options.
| 648 | #if defined(POOLING_LAYER_2) |
| 649 | #define POOL_SIZE 2 |
| 650 | #elif defined(POOLING_LAYER_3) |
| 651 | #define POOL_SIZE 3 |
| 652 | #elif defined(POOLING_LAYER_7) |
| 653 | #define POOL_SIZE 7 |
| 654 | #else // POOLING_LAYER_n |
| 655 | #error Please define POOLING_LAYER_N instead. |
| 656 | #endif // POOLING_LAYER_n |
| 657 | |
// Shader entry point for the fixed-size POOLING_LAYER_2/3/7 builds.
// Each invocation produces one vec2 result: it builds the source/destination iterators, delegates the
// window reduction to calculate_avg() (POOL_AVG or POOL_L2) or calculate_max() (otherwise), applies
// SQRT_OP for L2 pooling, and stores the pair through STORE_PACK2_CURRENT_ITEM_HALF.
| 658 | void main(void) |
| 659 | { |
| 660 | // Get pixels pointer |
| 661 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 662 | Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); |
| 663 | |
| 664 | //Load and calculate data |
| 665 | vec2 data; |
// L2 pooling shares the averaging path; its square root is applied afterwards.
| 666 | #if defined(POOL_AVG) || defined(POOL_L2) |
| 667 | data = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); |
| 668 | #else /*POOL_AVG*/ |
| 669 | data = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); |
| 670 | #endif /*POOL_AVG*/ |
| 671 | |
| 672 | #if defined(POOL_L2) |
| 673 | // Take square root of the result in L2 pooling |
| 674 | data = SQRT_OP(data); |
| 675 | #endif /* defined(POOL_L2) */ |
| 676 | |
| 677 | // Store result |
| 678 | STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data); |
| 679 | } |
| 680 | |
| 681 | #elif defined(POOLING_LAYER_3_OPTIMIZED) |
| 682 | |
// POOLING3x3_STRIDE1_fp16(res, input_ptr, input_iter)
// Computes four horizontally adjacent 3x3 pooling results with a horizontal stride of 1 and writes them to `res`.
//  - For each of the three rows (y offsets 0, 1, 2) it loads a vec4 at the iterator offset and a vec2 at
//    offset + uint(2), i.e. six consecutive values e0..e5 per row.
//    NOTE(review): this assumes one offset unit holds two packed halves - confirm against helpers_cs.h.
//  - POW2_OP is applied to every loaded vector (defined outside this chunk; presumably a no-op unless POOL_L2).
//  - The row values are shuffled into three work vectors per row, and POOL_OP folds rows 1 and 2 into row 0.
//  - The last two POOL_OP calls combine (e0,e1,e2,e3), (e1,e2,e3,e4) and (e2,e3,e4,e5), so res.x covers
//    columns 0..2, res.y 1..3, res.z 2..4 and res.w 3..5.
// The macro declares locals (data00, values000, ...), so it can be expanded only once per scope.
| 683 | #define POOLING3x3_STRIDE1_fp16(res, input_ptr, input_iter) \ |
| 684 | vec4 data00 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ |
| 685 | vec2 data01 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \ |
| 686 | vec4 data10 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ |
| 687 | vec2 data11 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \ |
| 688 | vec4 data20 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ |
| 689 | vec2 data21 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \ |
| 690 | data00 = POW2_OP(data00, 4); \ |
| 691 | data01 = POW2_OP(data01, 2); \ |
| 692 | data10 = POW2_OP(data10, 4); \ |
| 693 | data11 = POW2_OP(data11, 2); \ |
| 694 | data20 = POW2_OP(data20, 4); \ |
| 695 | data21 = POW2_OP(data21, 2); \ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 696 | \ |
| 697 | vec4 values000; \ |
| 698 | vec4 values001; \ |
| 699 | vec4 values010; \ |
| 700 | vec4 values100; \ |
| 701 | vec4 values101; \ |
| 702 | vec4 values11; \ |
| 703 | vec4 values200; \ |
| 704 | vec4 values201; \ |
| 705 | vec4 values21; \ |
| 706 | values000.xyzw = data00.xyzy; \ |
| 707 | values001.xyzw = data00.zwzw; \ |
| 708 | values010.x = data01.x; \ |
| 709 | values010.y = data00.w; \ |
| 710 | values010.zw = data01.xy; \ |
| 711 | values100.xyzw = data10.xyzy; \ |
| 712 | values101.xyzw = data10.zwzw; \ |
| 713 | values11.x = data11.x; \ |
| 714 | values11.y = data10.w; \ |
| 715 | values11.zw = data11.xy; \ |
| 716 | values200.xyzw = data20.xyzy; \ |
| 717 | values201.xyzw = data20.zwzw; \ |
| 718 | values21.x = data21.x; \ |
| 719 | values21.y = data20.w; \ |
| 720 | values21.zw = data21.xy; \ |
| 721 | POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ |
| 722 | POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ |
| 723 | POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ |
| 724 | POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ |
| 725 | POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ |
| 726 | POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ |
| 727 | POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ |
| 728 | POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) |
| 729 | |
// POOLING3x3_STRIDE2_fp16(res, input_ptr, input_iter)
// Computes four horizontally adjacent 3x3 pooling results with a horizontal stride of 2 and writes them to `res`.
//  - For each of the three rows (y offsets 0, 1, 2) it loads two vec4 (offset and offset + uint(2)) and the
//    .x lane of a vec2 at offset + uint(4), i.e. nine consecutive values e0..e8 per row.
//    NOTE(review): this assumes one offset unit holds two packed halves - confirm against helpers_cs.h.
//  - POW2_OP is applied to every loaded value (defined outside this chunk; presumably a no-op unless POOL_L2).
//  - The row values are shuffled into three work vectors per row, and POOL_OP folds rows 1 and 2 into row 0.
//  - The last two POOL_OP calls combine (e0,e2,e4,e6), (e1,e3,e5,e7) and (e2,e4,e6,e8), so res.x covers
//    columns 0..2, res.y 2..4, res.z 4..6 and res.w 6..8.
// The macro declares locals (data000, values000, ...), so it can be expanded only once per scope.
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 730 | #define POOLING3x3_STRIDE2_fp16(res, input_ptr, input_iter) \ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 731 | vec4 data000; \ |
| 732 | vec4 data001; \ |
| 733 | float data010; \ |
| 734 | vec4 data100; \ |
| 735 | vec4 data101; \ |
| 736 | float data11; \ |
| 737 | vec4 data200; \ |
| 738 | vec4 data201; \ |
| 739 | float data21; \ |
| 740 | vec2 datamiddle0; \ |
| 741 | vec2 datamiddle1; \ |
| 742 | vec2 datamiddle2; \ |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 743 | data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ |
| 744 | data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \ |
| 745 | datamiddle0 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 746 | data010 = datamiddle0.x; \ |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 747 | data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ |
| 748 | data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \ |
| 749 | datamiddle1 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 750 | data11 = datamiddle1.x; \ |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 751 | data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ |
| 752 | data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \ |
| 753 | datamiddle2 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 754 | data21 = datamiddle2.x; \ |
| 755 | data000 = POW2_OP(data000, 4); \ |
| 756 | data001 = POW2_OP(data001, 4); \ |
| 757 | data010 = POW2_OP(data010, 1); \ |
| 758 | data100 = POW2_OP(data100, 4); \ |
| 759 | data101 = POW2_OP(data101, 4); \ |
| 760 | data11 = POW2_OP(data11, 1); \ |
| 761 | data200 = POW2_OP(data200, 4); \ |
| 762 | data201 = POW2_OP(data201, 4); \ |
| 763 | data21 = POW2_OP(data21, 1); \ |
| 764 | \ |
| 765 | vec4 values000; \ |
| 766 | vec4 values001; \ |
| 767 | vec4 values010; \ |
| 768 | vec4 values100; \ |
| 769 | vec4 values101; \ |
| 770 | vec4 values11; \ |
| 771 | vec4 values200; \ |
| 772 | vec4 values201; \ |
| 773 | vec4 values21; \ |
| 774 | values000.xyzw = data000.xyzz; \ |
| 775 | values001.xyzw = vec4(data000.w, data001.xxy); \ |
| 776 | values010.xyzw = vec4(data001.zzw, data010); \ |
| 777 | values100.xyzw = data100.xyzz; \ |
| 778 | values101.xyzw = vec4(data100.w, data101.xxy); \ |
| 779 | values11.xyzw = vec4(data101.zzw, data11); \ |
| 780 | values200.xyzw = data200.xyzz; \ |
| 781 | values201.xyzw = vec4(data200.w, data201.xxy); \ |
| 782 | values21.xyzw = vec4(data201.zzw, data21); \ |
| 783 | POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ |
| 784 | POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ |
| 785 | POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ |
| 786 | POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ |
| 787 | POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ |
| 788 | POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ |
| 789 | POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ |
| 790 | POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) |
| 791 | |
// POOLING3x3_STRIDE3_fp16(res, input_ptr, input_iter)
// Computes four horizontally adjacent 3x3 pooling results with a horizontal stride of 3 and writes them to `res`.
//  - For each of the three rows (y offsets 0, 1, 2) it loads three vec4 (offset, offset + uint(2),
//    offset + uint(4)), i.e. twelve consecutive values e0..e11 per row.
//    NOTE(review): this assumes one offset unit holds two packed halves - confirm against helpers_cs.h.
//  - POW2_OP is applied to every loaded vector (defined outside this chunk; presumably a no-op unless POOL_L2).
//  - POOL_OP folds rows 1 and 2 into the row-0 vectors in place; no intermediate shuffle is needed because
//    the windows do not overlap.
//  - The last two POOL_OP calls combine (e0,e3,e6,e9), (e1,e4,e7,e10) and (e2,e5,e8,e11), so res.x covers
//    columns 0..2, res.y 3..5, res.z 6..8 and res.w 9..11.
// The macro declares locals (data000, data001, ...), so it can be expanded only once per scope.
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 792 | #define POOLING3x3_STRIDE3_fp16(res, input_ptr, input_iter) \ |
| 793 | vec4 data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ |
| 794 | vec4 data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \ |
| 795 | vec4 data010 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \ |
| 796 | vec4 data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ |
| 797 | vec4 data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \ |
| 798 | vec4 data11 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \ |
| 799 | vec4 data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ |
| 800 | vec4 data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \ |
| 801 | vec4 data21 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \ |
| 802 | data000 = POW2_OP(data000, 4); \ |
| 803 | data001 = POW2_OP(data001, 4); \ |
| 804 | data010 = POW2_OP(data010, 4); \ |
| 805 | data100 = POW2_OP(data100, 4); \ |
| 806 | data101 = POW2_OP(data101, 4); \ |
| 807 | data11 = POW2_OP(data11, 4); \ |
| 808 | data200 = POW2_OP(data200, 4); \ |
| 809 | data201 = POW2_OP(data201, 4); \ |
| 810 | data21 = POW2_OP(data21, 4); \ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 811 | \ |
| 812 | POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ |
| 813 | POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ |
| 814 | POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ |
| 815 | POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ |
| 816 | POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ |
| 817 | POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ |
| 818 | POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 819 | POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw)) |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 820 | |
// Shader entry point for POOLING_LAYER_3_OPTIMIZED builds.
// Each invocation produces four horizontally adjacent 3x3 pooling results (one vec4): the stride-specific
// POOLING3x3_STRIDE<n>_fp16 macro fills `res`, the average/L2 modes then scale each lane by the reciprocal
// of its window area, L2 additionally takes SQRT_OP, and the vec4 is stored with VSTORE2_PACK4_CURRENT_ITEM_HALF.
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 821 | void main(void) |
| 822 | { |
| 823 | // Get pixels pointer |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 824 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 825 | Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 826 | |
| 827 | vec4 res; |
| 828 | // Perform pooling 3x3 for 4 output elements |
// NOTE(review): there is no #else arm, so `res` is never assigned when STRIDE_X is not 1, 2 or 3;
// presumably the host only selects this kernel for those strides - confirm against the caller.
| 829 | #if STRIDE_X == 1 |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 830 | POOLING3x3_STRIDE1_fp16(res, src_ptr, src_iter); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 831 | #elif STRIDE_X == 2 |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 832 | POOLING3x3_STRIDE2_fp16(res, src_ptr, src_iter); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 833 | #elif STRIDE_X == 3 |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 834 | POOLING3x3_STRIDE3_fp16(res, src_ptr, src_iter); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 835 | #endif /*STRIDE_X == 1*/ |
| 836 | |
| 837 | // Divide by pool region in case of average pooling |
| 838 | #if defined(POOL_AVG) || defined(POOL_L2) |
// Per-lane window start: output column (4 * invocation.x + lane) times STRIDE_X, shifted left by PAD_X.
| 839 | ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X)); |
| 840 | int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; |
// Window ends are clamped to MAX_WIDTH / MAX_HEIGHT.
| 841 | ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH))); |
| 842 | int end_y = min((start_y + 3), MAX_HEIGHT); |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 843 | #if defined(EXCLUDE_PADDING) |
// With EXCLUDE_PADDING, negative starts are clamped to 0 so the divisor does not count those positions.
| 844 | start_x = max(ivec4(0), start_x); |
| 845 | start_y = max(0, start_y); |
| 846 | #endif /* defined(EXCLUDE_PADDING) */ |
// Scale each lane by 1 / (window height * window width).
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 847 | res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x))); |
| 848 | #endif /*POOL_AVG*/ |
| 849 | |
| 850 | #if defined(POOL_L2) |
| 851 | // Take square root of the result in L2 pooling |
| 852 | res = SQRT_OP(res); |
| 853 | #endif /* defined(POOL_L2) */ |
| 854 | |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 855 | VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 856 | } |
| 857 | |
| 858 | #elif defined(POOLING_LAYER_N) |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 859 | |
// Shader entry point for POOLING_LAYER_N builds: generic POOL_SIZE x POOL_SIZE pooling.
// Each invocation produces one vec2, i.e. two horizontally adjacent outputs whose windows start
// STRIDE_X input columns apart:
//   data.x <- window starting at column 0        (accumulators vdata00, vdata10, sdata.x)
//   data.y <- window starting at column STRIDE_X (accumulators vdata01, vdata11, sdata.y)
// Rows are walked with an 8-columns-per-step vector loop followed by a 2-columns-per-step leftover loop.
// The partial results are reduced with POOL_OP*, divided by the window area for POOL_AVG/POOL_L2,
// passed through SQRT_OP for POOL_L2, and stored with STORE_PACK2_CURRENT_ITEM_HALF.
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 860 | void main(void) |
| 861 | { |
| 862 | // Get pixels pointer |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 863 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 864 | Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 865 | |
// All accumulators are seeded with INITIAL_VALUE (defined outside this chunk).
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 866 | vec4 vdata00 = vec4(INITIAL_VALUE); |
| 867 | vec4 vdata01 = vec4(INITIAL_VALUE); |
| 868 | vec4 vdata10 = vec4(INITIAL_VALUE); |
| 869 | vec4 vdata11 = vec4(INITIAL_VALUE); |
| 870 | vec2 sdata = vec2(INITIAL_VALUE); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 871 | |
// Pass 1: first output (data.x), window columns [0, POOL_SIZE).
| 872 | for(int y = 0; y < int(POOL_SIZE); y++) |
| 873 | { |
| 874 | int x = 0; |
// Vector loop: two vec4 loads per step, the second at offset + uint(2).
| 875 | for(; x <= (int(POOL_SIZE) - 8); x += 8) |
| 876 | { |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 877 | vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0)); |
| 878 | vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(2)); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 879 | |
| 880 | #if defined(POOL_L2) |
| 881 | // Raise to power of 2 for L2 Pooling |
| 882 | data2 *= data2; |
| 883 | data3 *= data3; |
| 884 | #endif /* defined(POOL_L2) */ |
| 885 | |
| 886 | POOL_OP(vdata00, vdata00, data2); |
| 887 | POOL_OP(vdata10, vdata10, data3); |
| 888 | } |
| 889 | |
| 890 | // Leftover |
// Remaining columns are read as pairs; when only one column of the window is left, only .x is used.
| 891 | for(; x < int(POOL_SIZE); x = x + 2) |
| 892 | { |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 893 | vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0)); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 894 | #if defined(POOL_L2) |
| 895 | // Raise to power of 2 for L2 Pooling |
| 896 | data4middle *= data4middle; |
| 897 | #endif /* defined(POOL_L2) */ |
| 898 | if((x + 1) >= int(POOL_SIZE)) |
| 899 | { |
| 900 | POOL_OP_float(sdata.x, sdata.x, data4middle.x); |
| 901 | } |
| 902 | else |
| 903 | { |
| 904 | float data4; |
| 905 | POOL_OP_float(data4, data4middle.x, data4middle.y); |
| 906 | POOL_OP_float(sdata.x, sdata.x, data4); |
| 907 | } |
| 908 | } |
| 909 | } |
| 910 | |
// Pass 2: second output (data.y), window columns [STRIDE_X, STRIDE_X + POOL_SIZE).
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 911 | for(int y = 0; y < int(POOL_SIZE); y++) |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 912 | { |
// Even STRIDE_X: the window starts on an even column and is walked exactly like pass 1.
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 913 | if((STRIDE_X % 2) == 0) |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 914 | { |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 915 | int x1 = STRIDE_X; |
| 916 | for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8) |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 917 | { |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 918 | vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0)); |
| 919 | vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2)); |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 920 | |
| 921 | #if defined(POOL_L2) |
| 922 | // Raise to power of 2 for L2 Pooling |
| 923 | data2 *= data2; |
| 924 | data3 *= data3; |
| 925 | #endif /* defined(POOL_L2) */ |
| 926 | |
| 927 | POOL_OP(vdata01, vdata01, data2); |
| 928 | POOL_OP(vdata11, vdata11, data3); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 929 | } |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 930 | |
| 931 | // Leftover |
| 932 | for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2) |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 933 | { |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 934 | vec2 data4middle; |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 935 | data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0)); |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 936 | #if defined(POOL_L2) |
| 937 | // Raise to power of 2 for L2 Pooling |
| 938 | data4middle *= data4middle; |
| 939 | #endif /* defined(POOL_L2) */ |
| 940 | if((x1 + 1) >= int(POOL_SIZE + STRIDE_X)) |
| 941 | { |
| 942 | POOL_OP_float(sdata.y, sdata.y, data4middle.x); |
| 943 | } |
| 944 | else |
| 945 | { |
| 946 | float data4; |
| 947 | POOL_OP_float(data4, data4middle.x, data4middle.y); |
| 948 | POOL_OP_float(sdata.y, sdata.y, data4); |
| 949 | } |
| 950 | } |
| 951 | } |
| 952 | else |
| 953 | { |
// Odd STRIDE_X: the window's first column is taken from the .y lane of the pair loaded at
// STRIDE_X - 1, and the remaining columns are then walked starting from STRIDE_X + 1.
// NOTE(review): presumably required because loads are addressed in pairs of halves - confirm.
| 954 | vec2 dataorigin2; |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 955 | dataorigin2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (STRIDE_X - 1), y, 0)); |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 956 | #if defined(POOL_L2) |
| 957 | // Raise to power of 2 for L2 Pooling |
| 958 | dataorigin2.y *= dataorigin2.y; |
| 959 | #endif /* defined(POOL_L2) */ |
| 960 | POOL_OP_float(sdata.y, sdata.y, dataorigin2.y); |
| 961 | |
| 962 | int x1 = STRIDE_X + 1; |
| 963 | for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8) |
| 964 | { |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 965 | vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0)); |
| 966 | vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2)); |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 967 | |
| 968 | #if defined(POOL_L2) |
| 969 | // Raise to power of 2 for L2 Pooling |
| 970 | data2 *= data2; |
| 971 | data3 *= data3; |
| 972 | #endif /* defined(POOL_L2) */ |
| 973 | |
| 974 | POOL_OP(vdata01, vdata01, data2); |
| 975 | POOL_OP(vdata11, vdata11, data3); |
| 976 | } |
| 977 | |
| 978 | // Leftover |
| 979 | for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2) |
| 980 | { |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 981 | vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0)); |
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 982 | #if defined(POOL_L2) |
| 983 | // Raise to power of 2 for L2 Pooling |
| 984 | data4middle *= data4middle; |
| 985 | #endif /* defined(POOL_L2) */ |
| 986 | if((x1 + 1) >= int(POOL_SIZE + STRIDE_X)) |
| 987 | { |
| 988 | POOL_OP_float(sdata.y, sdata.y, data4middle.x); |
| 989 | } |
| 990 | else |
| 991 | { |
| 992 | float data4; |
| 993 | POOL_OP_float(data4, data4middle.x, data4middle.y); |
| 994 | POOL_OP_float(sdata.y, sdata.y, data4); |
| 995 | } |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 996 | } |
| 997 | } |
| 998 | } |
| 999 | |
| 1000 | //Reduce result |
// Fold each output's two vec4 accumulators to vec2, then to a scalar, and merge the leftover scalar.
| 1001 | vec4 reduce40; |
| 1002 | POOL_OP(reduce40, vdata00.xyzw, vdata10.xyzw); |
| 1003 | vec2 reduce20; |
| 1004 | POOL_OP_vec2(reduce20, reduce40.xy, reduce40.zw); |
| 1005 | vec4 reduce41; |
| 1006 | POOL_OP(reduce41, vdata01.xyzw, vdata11.xyzw); |
| 1007 | vec2 reduce21; |
| 1008 | POOL_OP_vec2(reduce21, reduce41.xy, reduce41.zw); |
| 1009 | vec2 data; |
| 1010 | POOL_OP_float(data.x, reduce20.x, reduce20.y); |
| 1011 | POOL_OP_float(data.x, data.x, sdata.x); |
| 1012 | POOL_OP_float(data.y, reduce21.x, reduce21.y); |
| 1013 | POOL_OP_float(data.y, data.y, sdata.y); |
| 1014 | |
| 1015 | #if defined(POOL_AVG) || defined(POOL_L2) |
| 1016 | { |
| 1017 | // Divide by pool region in case of average pooling |
// Window bounds of both outputs: starts are shifted by the padding, ends are clamped to MAX_WIDTH / MAX_HEIGHT.
Xinghang Zhou | 53a6ec5 | 2017-11-14 15:14:25 +0800 | [diff] [blame] | 1018 | int start_x1 = (2 * int(gl_GlobalInvocationID.x)) * STRIDE_X - PAD_X; |
| 1019 | int start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; |
| 1020 | int end_x1 = int(min(start_x1 + POOL_SIZE, MAX_WIDTH)); |
| 1021 | int end_y1 = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT)); |
| 1022 | int start_x2 = start_x1 + STRIDE_X; |
| 1023 | int start_y2 = start_y1; |
| 1024 | int end_x2 = int(min(start_x2 + POOL_SIZE, MAX_WIDTH)); |
| 1025 | int end_y2 = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT)); |
| 1026 | #if defined(EXCLUDE_PADDING) |
// With EXCLUDE_PADDING, negative starts are clamped to 0 so the divisor does not count those positions.
| 1027 | start_x1 = max(0, start_x1); |
| 1028 | start_y1 = max(0, start_y1); |
| 1029 | start_x2 = max(0, start_x2); |
| 1030 | start_y2 = max(0, start_y2); |
| 1031 | #endif /* defined(EXCLUDE_PADDING) */ |
// res1 holds the window area (height * width) used as divisor for each output.
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 1032 | vec2 res1; |
| 1033 | res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1)); |
| 1034 | res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2)); |
| 1035 | data.x = DIV_OP(data.x, res1.x); |
| 1036 | data.y = DIV_OP(data.y, res1.y); |
| 1037 | } |
| 1038 | #endif /* defined(POOL_AVG) || defined(POOL_L2) */ |
| 1039 | |
| 1040 | #if defined(POOL_L2) |
| 1041 | // Take square root of the result in L2 pooling |
| 1042 | data = SQRT_OP(data); |
| 1043 | #endif /* defined(POOL_L2) */ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 1044 | |
| 1045 | // Store result |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 1046 | STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 1047 | } |
Joel Liang | c5a7e59 | 2017-12-29 14:38:56 +0800 | [diff] [blame] | 1048 | #endif // POOLING_LAYER_N |
| 1049 | |
| 1050 | #else // DATA_TYPE_FP32 |
| 1051 | #error Data type not supported |
| 1052 | #endif // DATA_TYPE_FP32 |