Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 1 | /* |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 2 | * Copyright (c) 2017-2018 ARM Limited. |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | |
// Workgroup dimensions are injected at shader build time via LOCAL_SIZE_X/Y/Z defines.
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;

#include "helpers_cs.h"

// FP16 kernels only need medium precision for float arithmetic.
#if defined(DATA_TYPE_FP16)
precision mediump float;
#endif // DATA_TYPE_FP16
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 32 | |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 33 | #ifdef RESHAPE_TO_COLUMNS |
| 34 | |
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
 * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
 * @note The biases pointer and attributes are only declared when HAS_BIAS is defined.
 *
 * @param[in]  src_ptr       Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_attrs     The attributes of the source tensor
 * @param[out] dst_ptr       Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs     The attributes of the destination tensor
 * @param[in]  biases_ptr    Pointer to the biases tensor. Same as @p src_ptr
 * @param[in]  biases_attrs  The attributes of the biases tensor
 * @param[in]  width         The width of the input tensor
 * @param[in]  height        The height of the input tensor
 * @param[in]  depth         The depth of the input tensor
 * @param[in]  total_filters Total number of filters. 4th dimension of the weights matrix
 */

SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;
    ImageAttributes dst_attrs;
#ifdef HAS_BIAS
    VectorAttributes biases_attrs;
#endif /* HAS_BIAS */
    uint width;
    uint height;
    uint depth;
    uint total_filters;
};
| 64 | |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 65 | #if defined(DATA_TYPE_FP32) |
| 66 | |
| 67 | TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); |
| 68 | TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly); |
| 69 | #ifdef HAS_BIAS |
| 70 | TENSOR_DECLARATION(3, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly); |
| 71 | #endif /* BIAS */ |
| 72 | |
/** FP32 variant: copies one weight element per filter into the reshaped GEMM matrix.
 *
 * Each invocation owns one position of the kernel window and writes one destination
 * row; the last invocation additionally appends the bias row when HAS_BIAS is set.
 */
void main()
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
#ifdef HAS_BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* HAS_BIAS */

    // True only for the invocation with the highest (x, y, z) global id.
    bool last_invocation = (int(gl_GlobalInvocationID.x) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1))
                           && (int(gl_GlobalInvocationID.y) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
                           && (int(gl_GlobalInvocationID.z) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1));

    // Destination row for this invocation, derived from its flattened (x, y, z) position.
    uint dst_row_offset = (uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y))
                          + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y))
                          + (uint(gl_GlobalInvocationID.z) * uint(width) * uint(height) * uint(dst_attrs.stride_y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_row_offset);

    // Linearize convolution elements
    if(last_invocation)
    {
        // The last invocation also stores the bias value one row below each element.
        for(uint i = 0u; i < uint(total_filters); ++i)
        {
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
#ifdef HAS_BIAS
            STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), LOAD_CURRENT_ITEM(biases_ptr, biases_iter));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, biases_attrs.stride_x);
#endif /* HAS_BIAS */
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
        }
    }
    else
    {
        for(uint i = 0u; i < uint(total_filters); ++i)
        {
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
        }
    }
}
| 113 | |
#elif defined(DATA_TYPE_FP16)

// FP16: two 16-bit floats packed into each 32-bit uint buffer element.
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
#ifdef HAS_BIAS
TENSOR_DECLARATION(3, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* HAS_BIAS */
| 121 | |
/* FP16 variant: two filters are processed per loop iteration because destination
 * elements are stored as packed pairs of 16-bit halves inside a 32-bit word. */
void main()
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
#ifdef HAS_BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* HAS_BIAS */

    bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
                           && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.z) * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
    // Linearize convolution elements
    if(is_last_thread)
    {
        for(uint i = 0u; i < uint(total_filters); i = i + 2u)
        {
            // A 32-bit load yields two fp16 values; select the half addressed by the
            // current byte offset (offset/2 even -> low half, odd -> high half).
            vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            vec2 s;
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.x = s0.x;
            }
            else
            {
                s.x = s0.y;
            }
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));

            // Second filter of the pair, selected the same way.
            vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.y = s1.x;
            }
            else
            {
                s.y = s1.y;
            }
            STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
#ifdef HAS_BIAS
            // The last invocation appends the packed pair of bias values one row below.
            vec2 b = LOAD_UNPACK2_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
            STORE_PACK2_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, (2u * biases_attrs.stride_x));
#endif /* HAS_BIAS */
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
        }
    }
    else
    {
        for(uint i = 0u; i < uint(total_filters); i = i + 2u)
        {
            vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            vec2 s;
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.x = s0.x;
            }
            else
            {
                s.x = s0.y;
            }
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));

            vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.y = s1.x;
            }
            else
            {
                s.y = s1.y;
            }
            STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
        }
    }
}
| 202 | |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 203 | #endif /* DATA_TYPE_FP32 */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 204 | #endif // RESHAPE_TO_COLUMNS |
| 205 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 206 | #ifdef IM2COL_GENERIC |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 207 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 208 | /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. |
| 209 | * |
| 210 | * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 211 | * @note PAD_LEFT/PAD_RIGHT/PAD_TOP/PAD_BOTTOM must be passed for padding info, e.g. "#define PAD_LEFT xxx" |
| 212 | * @note KERNEL_WIDTH/KERNEL_HEIGHT/KERNEL_DEPTH must be passed for kernel dimension, e.g. "#define KERNEL_WIDTH xxx" |
| 213 | * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx" |
| 214 | * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx" |
| 215 | * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx" |
Alex Gilday | 7da29b6 | 2018-03-23 14:16:00 +0000 | [diff] [blame] | 216 | * @note DILATION_X/DILATION_Y must be passed for dilation sizes, e.g. "#define DILATION_X xxx" |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 217 | * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. |
| 218 | * |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 219 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 |
| 220 | * @param[in] src_attrs The attributes of the source tensor |
| 221 | * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr |
| 222 | * @param[in] dst_attrs The attributes of the destination tensor |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 223 | * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). |
| 224 | * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 225 | */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 226 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 227 | SHADER_PARAMS_DECLARATION |
| 228 | { |
| 229 | Tensor3DAttributes src_attrs; |
| 230 | ImageAttributes dst_attrs; |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 231 | uint src_stride_w; |
| 232 | uint dst_stride_w; |
| 233 | }; |
| 234 | |
#ifdef DATA_TYPE_FP32

TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);

/* FP32 im2col: each invocation linearizes one kernel window of one input channel
 * into a contiguous segment of one output row. */
void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    int xc = int(gl_GlobalInvocationID.x); // x coordinate in the convolved tensor
    int yc = int(gl_GlobalInvocationID.y); // y coordinate in the convolved tensor
    int ch = int(gl_GlobalInvocationID.z) % KERNEL_DEPTH; // input feature map
    int batch = int(gl_GlobalInvocationID.z) / KERNEL_DEPTH; // the batch

    // Calculate input indices (top-left corner of the receptive field, may be negative with padding)
    int xi = xc * STRIDE_X - PAD_LEFT;
    int yi = yc * STRIDE_Y - PAD_TOP;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * int(src_attrs.stride_z)) + (batch * int(src_stride_w)));

    // Calculate output indices
    int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
    int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
    // sizeof is not available in GLES, so we'll use stride_x
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * int(dst_attrs.stride_y)) + (batch * int(dst_stride_w)) + xo * int(dst_attrs.stride_x));

    uint src_pos = 0u;

    // Linearize convolution elements: walk the (dilated) kernel window row by row
    for(int y = yi, y_e = yi + KERNEL_HEIGHT * DILATION_Y; y < y_e; y += DILATION_Y)
    {
        for(int x = xi, x_e = xi + KERNEL_WIDTH * DILATION_X; x < x_e; x += DILATION_X, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, int(dst_attrs.stride_x)))
        {
#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            // No padding: every window position is guaranteed in-bounds
            src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
#else /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
            // Out-of-bounds window positions are written as zero (implicit padding)
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, 0.0f);
            }
            else
            {
                src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
            }
#endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
        }
    }

#ifdef HAS_BIAS
    // Only the last input channel appends the bias multiplicand 1.0 at the end of the row
    if(ch == (KERNEL_DEPTH - 1))
    {
        STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f);
    }
#endif /* HAS_BIAS */
}
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 292 | |
#elif defined(DATA_TYPE_FP16)

// FP16: two 16-bit values packed per 32-bit buffer element.
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);

#ifdef KERNEL_1x1

/* FP16 im2col for 1x1 kernels: each output element comes from one input element,
 * so the kernel only needs to repack fp16 pairs across channels. Even channels do
 * the work for their odd neighbor; odd-channel invocations exit early. */
void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    uint xc = gl_GlobalInvocationID.x;
    uint yc = gl_GlobalInvocationID.y;
    uint zc = gl_GlobalInvocationID.z;
    uint ch = zc % uint(KERNEL_DEPTH); // input feature map
    uint batch = zc / uint(KERNEL_DEPTH); // the batch

    // Calculate input indices (1x1 kernel: output coordinate maps directly to input)
    uint xi = xc;
    uint yi = yc;
    // NOTE(review): step_z is used here while stride_z is used below for the neighbor
    // channel - presumably step_z covers the padded z-step; confirm against helpers_cs.h.
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.step_z);

    // Calculate output indices
    uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x;
    uint xo = ch * dst_element_count;
    uint yo = xc + yc * uint(CONVOLVED_WIDTH);
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo);

    bool x_start_even = ((xc % 2u) == 0u);
    bool z_depth_even = ((uint(KERNEL_DEPTH) % 2u) == 0u);
    uint input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y);
    uint tmp_left = 0u;
    uint tmp_right = 0u;

    // Odd channels are handled by the preceding even channel's invocation.
    if(ch % 2u != 0u)
    {
        return;
    }

    if(z_depth_even || (!z_depth_even && (int(ch) < (KERNEL_DEPTH - 1))))
    {
        // Pair this channel's value with the next channel's value into one 32-bit word.
        tmp_left = LOAD(src_ptr, input_pos);
        input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y + src_attrs.stride_z);
        tmp_right = LOAD(src_ptr, input_pos);
        if(x_start_even)
        {
            tmp_right = (tmp_left & 0xffffu) + (tmp_right << 16u);
        }
        else
        {
            tmp_right = (tmp_left >> 16u) + (tmp_right & 0xffff0000u);
        }
        STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);

#ifdef HAS_BIAS
        // Last channel pair appends the bias multiplicand (1.0 packed in the low half).
        if(ch == (uint(KERNEL_DEPTH) - 2u))
        {
            mediump vec2 bias_vec = vec2(1.f, 0.f);
            uint bias_u = packHalf2x16(bias_vec);
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, bias_u);
        }
#endif /* HAS_BIAS */
    }
    else
    {
        // Odd KERNEL_DEPTH and this is the last channel: no partner channel to pack with.
        tmp_left = LOAD(src_ptr, input_pos);
        if(x_start_even)
        {
            tmp_right = (tmp_left & 0xffffu);
        }
        else
        {
            tmp_right = (tmp_left >> 16u);
        }

#ifdef HAS_BIAS
        // Pack the bias multiplicand 1.0 into the unused high half of the word.
        mediump vec2 bias_vec = vec2(0.f, 1.f);
        uint bias_u = packHalf2x16(bias_vec);
        tmp_right += (bias_u & 0xffff0000u);
#endif /* HAS_BIAS */

        STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
    }
}
| 379 | |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 380 | #else /* KERNEL_1x1 */ |
| 381 | |
| 382 | void main(void) |
| 383 | { |
| 384 | uint xc = gl_GlobalInvocationID.x; |
| 385 | uint yc = gl_GlobalInvocationID.y; |
| 386 | uint zc = gl_GlobalInvocationID.z; |
| 387 | uint ch = zc % uint(KERNEL_DEPTH); // input feature map |
| 388 | uint batch = zc / uint(KERNEL_DEPTH); // the batch |
| 389 | |
| 390 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); |
| 391 | Tensor3DIterator src_iter_b = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); |
| 392 | ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); |
| 393 | |
| 394 | // Calculate input indeces |
| 395 | uint src_element_count = src_attrs.step_x / src_attrs.stride_x; |
| 396 | uint xi = (xc * uint(STRIDE_X)) / src_element_count; |
| 397 | uint yi = yc * uint(STRIDE_Y); |
| 398 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.stride_z); |
| 399 | |
| 400 | // Calculate output indeces |
| 401 | uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x; |
| 402 | uint xo = (ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT)) * dst_element_count; |
| 403 | uint yo = xc + yc * uint(CONVOLVED_WIDTH); |
| 404 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo); |
| 405 | |
| 406 | bool x_start_even = ((xc * uint(STRIDE_X)) % 2u == 0u); |
| 407 | bool z_start_even = ((ch % 2u) == 0u); |
| 408 | uint input_pos = 0u; |
| 409 | uint tmp = 0u; |
| 410 | uint tmp_left = 0u; |
| 411 | uint tmp_right = 0u; |
| 412 | |
| 413 | // Linearize convolution elements |
| 414 | for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y) |
| 415 | { |
| 416 | uint xstart = 0u; |
| 417 | uint xend = 0u; |
| 418 | |
| 419 | // even col, even row |
| 420 | if(x_start_even) |
| 421 | { |
| 422 | if(((y - yi + ch) % 2u) == 0u) |
| 423 | { |
| 424 | for(uint x = xi, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) |
| 425 | { |
| 426 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 427 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos)); |
| 428 | } |
| 429 | } |
| 430 | else |
| 431 | { |
| 432 | // 1st pair |
| 433 | if(!z_start_even && (y == yi)) |
| 434 | { |
| 435 | // cross 2d feature map |
| 436 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + |
| 437 | (ch - 1u) * src_attrs.stride_z); |
| 438 | } |
| 439 | else |
| 440 | { |
| 441 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, |
| 442 | (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y); |
| 443 | } |
| 444 | tmp_right = LOAD(src_ptr, input_pos); |
| 445 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); |
| 446 | tmp_left = LOAD(src_ptr, input_pos); |
| 447 | tmp_right = (tmp_right & 0xffffu) + (tmp_left << 16u); |
| 448 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 449 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); |
| 450 | |
| 451 | // remaining |
| 452 | for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) |
| 453 | { |
| 454 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x - 1u) * src_attrs.step_x + y * src_attrs.stride_y); |
| 455 | tmp_left = LOAD(src_ptr, input_pos); |
| 456 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 457 | tmp_right = LOAD(src_ptr, input_pos); |
| 458 | tmp_right = (tmp_left >> 16u) + (tmp_right << 16u); |
| 459 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 460 | } |
| 461 | } |
| 462 | } |
| 463 | else |
| 464 | { |
| 465 | if((((y - yi) % 2u) == 0u && !z_start_even) || (((y - yi) % 2u) != 0u && z_start_even)) |
| 466 | { |
| 467 | // 1st pair |
| 468 | if(y == yi) |
| 469 | { |
| 470 | // cross 2d feature map |
| 471 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + |
| 472 | (ch - 1u) * src_attrs.stride_z); |
| 473 | } |
| 474 | else |
| 475 | { |
| 476 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, |
| 477 | (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y); |
| 478 | } |
| 479 | |
| 480 | tmp_right = LOAD(src_ptr, input_pos); |
| 481 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); |
| 482 | tmp_left = LOAD(src_ptr, input_pos); |
| 483 | tmp_right = (tmp_right >> 16u) + (tmp_left & 0xffff0000u); |
| 484 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 485 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); |
| 486 | |
| 487 | // remaining |
| 488 | for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) |
| 489 | { |
| 490 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 491 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos)); |
| 492 | } |
| 493 | } |
| 494 | else if((((y - yi) % 2u) == 0u && z_start_even) || (((y - yi) % 2u) != 0u && !z_start_even)) |
| 495 | { |
| 496 | // 1st pair |
| 497 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); |
| 498 | tmp_right = LOAD(src_ptr, input_pos); |
| 499 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + 1u) * src_attrs.step_x + y * src_attrs.stride_y); |
| 500 | tmp_left = LOAD(src_ptr, input_pos); |
| 501 | tmp_right = (tmp_right >> 16u) + (tmp_left << 16u); |
| 502 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 503 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); |
| 504 | |
| 505 | // remaining |
| 506 | for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) |
| 507 | { |
| 508 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 509 | tmp_right = LOAD(src_ptr, input_pos); |
| 510 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x + 1u) * src_attrs.step_x + y * src_attrs.stride_y); |
| 511 | tmp_left = LOAD(src_ptr, input_pos); |
| 512 | tmp_right = (tmp_right >> 16u) + (tmp_left << 16u); |
| 513 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 514 | } |
| 515 | } |
| 516 | } |
| 517 | } |
| 518 | |
| 519 | // NOTE: must handle last element manually instead of in loops |
| 520 | // to avoid write conflict across 2d boundary |
| 521 | if(ch == uint(KERNEL_DEPTH) - 1u) |
| 522 | { |
| 523 | uint x = xi + (uint(KERNEL_WIDTH) / 2u); |
| 524 | uint y = yi + uint(KERNEL_HEIGHT) - 1u; |
| 525 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 526 | tmp = LOAD(src_ptr, input_pos); |
| 527 | if(!x_start_even) |
| 528 | { |
| 529 | tmp = (tmp >> 16u) + (tmp << 16u); |
| 530 | } |
| 531 | |
| 532 | #ifdef HAS_BIAS |
| 533 | mediump vec2 bias_vec = vec2(1.f, 1.f); |
| 534 | uint bias_u = packHalf2x16(bias_vec); |
| 535 | if(z_start_even) |
| 536 | { |
| 537 | tmp = (tmp & 0xffffu) + (bias_u & 0xffff0000u); |
| 538 | } |
| 539 | else |
| 540 | { |
| 541 | tmp = (bias_u & 0xffffu); |
| 542 | } |
| 543 | #endif /* HAS_BIAS */ |
| 544 | |
| 545 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp); |
| 546 | } |
| 547 | } |
| 548 | |
| 549 | #endif /* KERNEL_1x1 */ |
| 550 | #else /* DATA_TYPE_FP32 */ |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 551 | #error Data type not supported |
| 552 | #endif /* DATA_TYPE_FP32 */ |
| 553 | #endif /* IM2COL_GENERIC */ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 554 | |
| 555 | #ifdef IM2COL_REDUCED |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 556 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 557 | /** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation |
| 558 | * |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 559 | * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 560 | * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. |
| 561 | * |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 562 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 |
| 563 | * @param[in] src_attrs The attributes of the source tensor |
| 564 | * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr |
| 565 | * @param[in] dst_attrs The attributes of the destination tensor |
| 566 | * @param[in] width The width of the input tensor |
| 567 | * @param[in] height The height of the input tensor |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 568 | */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 569 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 570 | SHADER_PARAMS_DECLARATION |
| 571 | { |
| 572 | Tensor3DAttributes src_attrs; |
| 573 | VectorAttributes dst_attrs; |
| 574 | uint width; |
| 575 | uint height; |
| 576 | }; |
| 577 | |
| 578 | #ifdef DATA_TYPE_FP32 |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 579 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 580 | TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); |
| 581 | TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); |
| 582 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 583 | void main(void) |
| 584 | { |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 585 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 586 | VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 587 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 588 | uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); |
| 589 | uvec3 size = uvec3(gl_WorkGroupSize.xyz); |
| 590 | uint image_size = width * height; |
| 591 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x + pos.y * width + pos.z * image_size); |
| 592 | |
| 593 | STORE(dst_ptr, tmp_out_offset, LOAD_CURRENT_ITEM(src_ptr, src_iter)); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 594 | |
| 595 | #ifdef HAS_BIAS |
| 596 | // If it is the last thread in the 3 dimensional workgroup |
| 597 | if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1)) |
| 598 | { |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 599 | tmp_out_offset += (dst_attrs.stride_x >> uint(2)); |
| 600 | STORE(dst_ptr, tmp_out_offset, 1.f); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 601 | } |
| 602 | #endif // HAS_BIAS |
| 603 | } |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 604 | |
| 605 | #elif defined(DATA_TYPE_FP16) |
| 606 | |
| 607 | #if defined(IM2COL_REDUCED_8X) |
| 608 | TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly); |
| 609 | TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, restrict); |
| 610 | #elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */ |
| 611 | TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly); |
| 612 | TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, restrict); |
| 613 | #else /* IM2COL_REDUCED_8X */ |
| 614 | TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); |
| 615 | TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict); |
| 616 | #endif /* IM2COL_REDUCED_8X */ |
| 617 | |
| 618 | #if defined(IM2COL_REDUCED_GENERIC) |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 619 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 620 | void main(void) |
| 621 | { |
| 622 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 623 | Tensor3DIterator src_nostep_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); |
| 624 | VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift); |
| 625 | |
| 626 | uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); |
| 627 | uvec3 size = uvec3(gl_WorkGroupSize.xyz); |
| 628 | uint image_size = width * height; |
| 629 | uint element_count = src_attrs.step_x / src_attrs.stride_x; |
| 630 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * element_count + pos.y * width + pos.z * image_size); |
| 631 | uint width_fp16 = (width + uint(1)) >> uint(1); |
| 632 | uint tmp; |
| 633 | |
| 634 | // odd width |
| 635 | if(width % uint(2) != uint(0)) |
| 636 | { |
| 637 | // even row |
| 638 | if((pos.y + pos.z * height) % uint(2) == uint(0)) |
| 639 | { |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 640 | // skip last element of each line to avoid write conflict except for last line |
| 641 | if((pos.x < (width / element_count)) || ((pos.y == gl_NumWorkGroups.y - 1u) && (pos.z == gl_NumWorkGroups.z - 1u))) |
| 642 | { |
| 643 | tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 644 | STORE(dst_ptr, tmp_out_offset, tmp); |
| 645 | } |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 646 | } |
| 647 | else |
| 648 | { |
| 649 | // special op |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 650 | uint tmp_left = uint(0); |
| 651 | uint tmp_right = uint(0); |
| 652 | tmp_right = LOAD_CURRENT_ITEM(src_ptr, src_iter); //right half |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 653 | if(pos.x == uint(0)) |
| 654 | { |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 655 | tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); //left half |
| 656 | tmp_right = (tmp_left & uint(0xffff)) + (tmp_right << uint(16)); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 657 | } |
| 658 | else |
| 659 | { |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 660 | tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z))); |
| 661 | tmp_right = ((tmp_left >> uint(16)) + (tmp_right << uint(16))); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 662 | } |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 663 | STORE(dst_ptr, tmp_out_offset, tmp_right); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 664 | } |
| 665 | } |
| 666 | else |
| 667 | { |
| 668 | tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 669 | STORE(dst_ptr, tmp_out_offset, tmp); |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 670 | } |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 671 | |
| 672 | #ifdef HAS_BIAS |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 673 | // If it is the last thread in the 3 dimensional workgroup |
| 674 | if(pos.x == (size.x - 1u) && pos.y == (size.y - 1u) && pos.z == (size.z - 1u)) |
| 675 | { |
| 676 | tmp_out_offset += (dst_attrs.stride_x >> dst_shift); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 677 | |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 678 | // FIXME: need odd/even detection for tmp_out_offset? |
| 679 | mediump vec2 bias_vec = vec2(1.0f, 1.0f); |
| 680 | STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 681 | } |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 682 | #endif // HAS_BIAS |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 683 | } |
| 684 | |
| 685 | #else /* IM2COL_REDUCED_GENERIC */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 686 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 687 | void main(void) |
| 688 | { |
| 689 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 690 | VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift); |
| 691 | |
| 692 | uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); |
| 693 | #if defined(IM2COL_REDUCED_8X) |
| 694 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE)); |
| 695 | uvec4 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 696 | STORE(dst_ptr, tmp_out_offset, tmp); |
| 697 | #elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */ |
| 698 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE)); |
| 699 | uvec2 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 700 | STORE(dst_ptr, tmp_out_offset, tmp); |
| 701 | #else /* IM2COL_REDUCED_8X */ |
| 702 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE)); |
| 703 | uint tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 704 | STORE(dst_ptr, tmp_out_offset, tmp); |
| 705 | #endif /* IM2COL_REDUCED_8X */ |
| 706 | } |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 707 | |
| 708 | #endif /* IM2COL_REDUCED_GENERIC */ |
| 709 | #else /* DATA_TYPE_FP32 */ |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 710 | #error Data type not supported |
| 711 | #endif /* DATA_TYPE_FP32 */ |
| 712 | #endif /* IM2COL_REDUCED */ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 713 | |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 714 | #ifdef COL2IM |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 715 | #ifdef WIDTH_OUTPUT |
| 716 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 717 | /** This kernel performs a reshaping of the output of the convolution layer. |
| 718 | * |
 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" or "#define DATA_TYPE_FP16"
| 720 | * |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 721 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 |
| 722 | * @param[in] src_attrs The attributes of the source tensor |
| 723 | * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr |
| 724 | * @param[in] dst_attrs The attributes of the destination tensor |
| 725 | * @param[in] dst_depth The length of the destination tensor in Z dimension |
| 726 | * @param[in] dst_strideZ The actual stride of the destination tensor in Z dimension |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 727 | */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 728 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 729 | SHADER_PARAMS_DECLARATION |
| 730 | { |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 731 | Tensor3DAttributes src_attrs; |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 732 | Tensor3DAttributes dst_attrs; |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 733 | uint dst_depth; |
| 734 | uint dst_strideZ; |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 735 | }; |
| 736 | |
| 737 | #ifdef DATA_TYPE_FP32 |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 738 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 739 | TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); |
| 740 | TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 741 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 742 | void main(void) |
| 743 | { |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 744 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 745 | Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 746 | |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 747 | uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 748 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 749 | |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 750 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter)); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 751 | } |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 752 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 753 | #elif defined(DATA_TYPE_FP16) |
| 754 | |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 755 | TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); |
| 756 | TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict); |
| 757 | |
/* FP16 COL2IM: buffers are addressed as uint words, each packing two fp16
 * values. Every invocation produces one packed output word, i.e. two output
 * channels' worth of data, by gathering the matching halves of two source
 * words. The parity of the output channel (pos.z % dst_depth) selects whether
 * the low (.x) or high (.y) half of each unpacked pair is used. */
void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);

    if((pos.z % dst_depth) % 2u == 0u)
    {
        // Even output channel: the wanted halves sit in the low 16 bits of the
        // two consecutive source rows (pos.x is doubled since each word spans
        // two x positions).
        uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ;
        uint tmp1_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
        uint tmp2_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
        vec2 tmp1                   = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
        vec2 tmp2                   = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
        vec2 result                 = vec2(tmp1.x, tmp2.x);
        STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
    }
    else
    {
        // Odd output channel: step back 2 bytes (one fp16) so the wanted
        // halves land in the high 16 bits of the loaded words.
        uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ - 2u;
        uint tmp1_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
        uint tmp2_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
        vec2 tmp1                   = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
        vec2 tmp2                   = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
        vec2 result                 = vec2(tmp1.y, tmp2.y);
        STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
    }
}
| 786 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 787 | #else /* DATA_TYPE_FP32 */ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 788 | #error Data type not supported |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 789 | #endif /* DATA_TYPE_FP32 */ |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 790 | #endif /* WIDTH_OUTPUT */ |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 791 | #endif /* COL2IM */ |