/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
| 24 | |
/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 *          NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
| 32 | |
| 33 | #include <algorithm> |
| 34 | #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" |
| 35 | #include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp" |
| 36 | |
| 37 | #pragma once |
| 38 | |
| 39 | namespace depthwise |
| 40 | { |
| 41 | |
/** Number of channels grouped into one unit of parallel work: `get_window()`
 *  reports the channel count in blocks of this size and `run()` converts block
 *  indices back into channel offsets with it.
 */
constexpr unsigned int CHANNEL_BLOCK = 16;
| 43 | |
namespace
{
/** Total amount of padding (both sides combined) required along one dimension.
 *
 * @param[in] padding_same True to compute "SAME" padding; when false no
 *                         padding is applied and 0 is returned.
 * @param[in] kernel_dim   Kernel extent along the dimension.
 * @param[in] stride_dim   Stride along the dimension (must be non-zero, it is
 *                         used as a divisor).
 * @param[in] input_dim    Input extent along the dimension.
 *
 * @return Non-negative total padding along the dimension.
 */
inline int pad_along_dim(
  const bool padding_same,
  const int kernel_dim,
  const int stride_dim,
  const int input_dim
)
{
  // Without "SAME" padding nothing is ever added around the input.
  if (!padding_same)
  {
    return 0;
  }

  // The amount subtracted from the kernel is the remainder of the input
  // extent by the stride, or a whole stride when the input divides evenly.
  const int remainder = input_dim % stride_dim;
  const int covered = (remainder != 0) ? remainder : stride_dim;

  // Never report negative padding.
  return std::max(kernel_dim - covered, 0);
}
}  // namespace
| 61 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 62 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 63 | int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size( |
| 64 | const int dim_size, const bool same_padding |
| 65 | ) |
| 66 | { |
| 67 | return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR); |
| 68 | } |
| 69 | |
Georgios Pinitas | 1a57ad1 | 2019-01-09 16:11:51 +0000 | [diff] [blame^] | 70 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 71 | int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size( |
| 72 | const int dim_size, const unsigned int padding_before, const unsigned int padding_after |
| 73 | ) |
| 74 | { |
| 75 | return iceildiv(dim_size + padding_before + padding_after - KR + 1, SR); |
| 76 | } |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 77 | |
| 78 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 79 | DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution( |
| 80 | const int n_batches, const int n_input_rows, const int n_input_cols, |
| 81 | const int n_channels, const bool padding_same, |
| 82 | const TIn* const weights, |
| 83 | const TIn* const input, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 84 | TOut* const output, |
| 85 | const int weight_col_stride, |
| 86 | const int weight_row_stride, |
| 87 | const int input_col_stride, |
| 88 | const int input_row_stride, |
| 89 | const int input_batch_stride, |
| 90 | const int output_col_stride, |
| 91 | const int output_row_stride, |
| 92 | const int output_batch_stride |
Georgios Pinitas | 1a57ad1 | 2019-01-09 16:11:51 +0000 | [diff] [blame^] | 93 | ) : DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>( |
| 94 | n_batches, n_input_rows, n_input_cols, |
| 95 | n_channels, |
| 96 | pad_along_dim(padding_same, KR, SR, n_input_rows) / 2, /* top padding */ |
| 97 | pad_along_dim(padding_same, KC, SC, n_input_cols) / 2, /* left padding */ |
| 98 | iceildiv(pad_along_dim(padding_same, KR, SR, n_input_rows), 2), /* bottom padding */ |
| 99 | iceildiv(pad_along_dim(padding_same, KC, SC, n_input_cols), 2), /* right padding */ |
| 100 | weights, input, output, |
| 101 | weight_col_stride, weight_row_stride, |
| 102 | input_col_stride, input_row_stride, input_batch_stride, |
| 103 | output_col_stride, output_row_stride, output_batch_stride |
| 104 | ) |
| 105 | { |
| 106 | } |
| 107 | |
| 108 | |
| 109 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 110 | DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution( |
| 111 | const int n_batches, const int n_input_rows, const int n_input_cols, |
| 112 | const int n_channels, |
| 113 | const unsigned int padding_top, |
| 114 | const unsigned int padding_left, |
| 115 | const unsigned int padding_bottom, |
| 116 | const unsigned int padding_right, |
| 117 | const TIn* const weights, |
| 118 | const TIn* const input, |
| 119 | TOut* const output, |
| 120 | const int weight_col_stride, |
| 121 | const int weight_row_stride, |
| 122 | const int input_col_stride, |
| 123 | const int input_row_stride, |
| 124 | const int input_batch_stride, |
| 125 | const int output_col_stride, |
| 126 | const int output_row_stride, |
| 127 | const int output_batch_stride |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 128 | ) : _weights(weights), _input(input), _output(output), |
| 129 | _n_batches(n_batches), |
| 130 | _n_input_rows(n_input_rows), |
| 131 | _n_input_cols(n_input_cols), |
| 132 | _n_channels(n_channels), |
Georgios Pinitas | 1a57ad1 | 2019-01-09 16:11:51 +0000 | [diff] [blame^] | 133 | _n_output_rows(get_output_size(n_input_rows, padding_top, padding_bottom)), |
| 134 | _n_output_cols(get_output_size(n_input_cols, padding_left, padding_right)), |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 135 | _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)), |
| 136 | _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)), |
Georgios Pinitas | 1a57ad1 | 2019-01-09 16:11:51 +0000 | [diff] [blame^] | 137 | _padding_top(padding_top), |
| 138 | _padding_left(padding_left), |
| 139 | _padding_bottom(padding_bottom), |
| 140 | _padding_right(padding_right), |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 141 | _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels), |
| 142 | _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride), |
| 143 | _input_col_stride(input_col_stride ? input_col_stride : _n_channels), |
| 144 | _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride), |
| 145 | _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride), |
| 146 | _output_col_stride(output_col_stride ? output_col_stride : _n_channels), |
| 147 | _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride), |
Georgios Pinitas | a799ce0 | 2018-09-12 20:11:34 +0100 | [diff] [blame] | 148 | _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride), |
| 149 | _input_offset(0), _weights_offset(0) |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 150 | { |
| 151 | } |
| 152 | |
| 153 | |
| 154 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 155 | unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const |
| 156 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 157 | // Parallelise over blocks of channels. |
| 158 | return iceildiv(_n_channels, CHANNEL_BLOCK); |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 159 | } |
| 160 | |
Georgios Pinitas | a799ce0 | 2018-09-12 20:11:34 +0100 | [diff] [blame] | 161 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 162 | void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset) |
| 163 | { |
| 164 | _input_offset = input_offset; |
| 165 | _weights_offset = weights_offset; |
| 166 | } |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 167 | |
| 168 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 169 | void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run( |
| 170 | const unsigned int start, |
| 171 | const unsigned int stop |
| 172 | ) |
| 173 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 174 | // Parallelise over blocks of channels |
| 175 | const auto start_channel = CHANNEL_BLOCK * start; |
| 176 | const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop); |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 177 | |
| 178 | // Compute top and bottom padding for input and output |
Georgios Pinitas | 1a57ad1 | 2019-01-09 16:11:51 +0000 | [diff] [blame^] | 179 | const int input_pad_top = _padding_top; |
| 180 | const int input_pad_left = _padding_left; |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 181 | constexpr int tile_overlap = kernel_rows - stride_rows; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 182 | |
| 183 | // Perform the convolution by calling `process_tile_row` for each tile row in |
| 184 | // each batch. |
| 185 | for (int batch = 0; batch < _n_batches; batch++) |
| 186 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 187 | const TIn* const inptr_batch = _input + batch*_input_batch_stride; |
| 188 | TOut* const outptr_batch = _output + batch*_output_batch_stride; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 189 | |
| 190 | // Loop over rows of tiles |
| 191 | for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++) |
| 192 | { |
| 193 | // Pointer to the row |
| 194 | const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top; |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 195 | const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride); |
| 196 | TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 197 | |
| 198 | // Input padding (top + bottom) for the row |
| 199 | const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top; |
| 200 | const int input_row_bottom = input_row_top + inner_tile_rows; |
| 201 | const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0; |
| 202 | const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows); |
| 203 | |
| 204 | // Output padding (bottom) for the row |
| 205 | const int output_row_bottom = (tile_i + 1)*output_tile_rows; |
| 206 | const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows); |
| 207 | |
| 208 | // Process the row |
| 209 | process_tile_row( |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 210 | stop_channel - start_channel, |
| 211 | _weights + start_channel, _weight_row_stride, _weight_col_stride, |
| 212 | inptr_row + start_channel, _input_row_stride, _input_col_stride, |
| 213 | outptr_row + start_channel, _output_row_stride, _output_col_stride, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 214 | input_row_pad_top, input_pad_left, input_row_pad_bottom, |
| 215 | output_row_pad_bottom, |
Georgios Pinitas | a799ce0 | 2018-09-12 20:11:34 +0100 | [diff] [blame] | 216 | _n_tile_cols, _n_input_cols, _n_output_cols, |
| 217 | _input_offset, _weights_offset |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 218 | ); |
| 219 | } |
| 220 | } |
| 221 | } |
| 222 | |
| 223 | |
| 224 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 225 | void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row( |
| 226 | const int n_channels, |
| 227 | const TIn* const weights, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 228 | const int weight_row_stride, |
| 229 | const int weight_col_stride, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 230 | const TIn* const inptr, |
| 231 | const int in_row_stride, |
| 232 | const int in_col_stride, |
| 233 | TOut* const outptr, |
| 234 | const int out_row_stride, |
| 235 | const int out_col_stride, |
| 236 | const int row_pad_in_top, |
| 237 | const int row_pad_in_left, |
| 238 | const int row_pad_in_bottom, |
| 239 | const int row_pad_out_bottom, |
| 240 | const int n_tiles, |
| 241 | const int n_input_cols, |
Georgios Pinitas | a799ce0 | 2018-09-12 20:11:34 +0100 | [diff] [blame] | 242 | const int n_output_cols, |
| 243 | const int input_offset, |
| 244 | const int weights_offset |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 245 | ) |
| 246 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 247 | constexpr int tile_overlap = kernel_cols - stride_cols; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 248 | |
| 249 | // Loop over columns of tiles |
| 250 | for (int tile_j = 0; tile_j < n_tiles; tile_j++) |
| 251 | { |
| 252 | // Input padding (left + right) for the tile |
| 253 | const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0; |
| 254 | const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left; |
| 255 | const int t_in_end = t_in_start + inner_tile_cols; |
| 256 | const int t_pad_in_right = std::max(0, t_in_end - n_input_cols); |
| 257 | |
| 258 | // Output padding (right) for the tile |
| 259 | const int t_out_end = (tile_j + 1) * output_tile_cols; |
| 260 | const int t_pad_out_right = std::max(0, t_out_end - n_output_cols); |
| 261 | |
| 262 | // Get pointers into the inputs and outputs |
| 263 | const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left; |
| 264 | const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride); |
| 265 | TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride; |
| 266 | |
| 267 | // Apply the specific tile processing function |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 268 | const bool pad_top = row_pad_in_top > 0; |
| 269 | const bool pad_left = t_pad_in_left > 0; |
| 270 | const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom; |
| 271 | const bool pad_right = t_pad_in_right || t_pad_out_right; |
| 272 | |
| 273 | const TileFn tilefn = [&] () { |
| 274 | if (!pad_top && !pad_left && !pad_bottom && !pad_right) |
| 275 | { |
| 276 | // No padding |
| 277 | return tilefn_unpadded; |
| 278 | } |
| 279 | else if (pad_top && !pad_left && !pad_bottom && !pad_right) |
| 280 | { |
| 281 | // Padding on the top only, subtract off the minimum expected padding in |
| 282 | // order to index into the array of specialised methods. |
| 283 | const int index = row_pad_in_top - min_in_pad_top; |
| 284 | return tilefn_top[index]; |
| 285 | } |
| 286 | else if (!pad_top && pad_left && !pad_bottom && !pad_right) |
| 287 | { |
| 288 | // Padding on the left only, subtract off the minimum expected padding in |
| 289 | // order to index into the array of specialised methods. |
| 290 | const int index = t_pad_in_left - min_in_pad_left; |
| 291 | return tilefn_left[index]; |
| 292 | } |
| 293 | else if (!pad_top && !pad_left && pad_bottom && !pad_right) |
| 294 | { |
| 295 | // Padding on the bottom only |
| 296 | return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom]; |
| 297 | } |
| 298 | else if (!pad_top && !pad_left && !pad_bottom && pad_right) |
| 299 | { |
| 300 | // Padding on the right only |
| 301 | return tilefn_right[t_pad_in_right][t_pad_out_right]; |
| 302 | } |
| 303 | else |
| 304 | { |
| 305 | // Otherwise use generic tile processing method. |
| 306 | return tilefn_generic; |
| 307 | } |
| 308 | }(); |
| 309 | |
| 310 | tilefn( |
| 311 | n_channels, |
| 312 | weights, weight_row_stride, weight_col_stride, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 313 | inptr_col, in_row_stride, in_col_stride, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 314 | outptr_col, out_row_stride, out_col_stride, |
| 315 | row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, |
Georgios Pinitas | a799ce0 | 2018-09-12 20:11:34 +0100 | [diff] [blame] | 316 | row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 317 | ); |
| 318 | } |
| 319 | } |
| 320 | |
| 321 | |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 322 | // New templated struct used solely as a way to provide tile processing |
| 323 | // specialisations. |
| 324 | template <int OutputTileRows, int OutputTileCols, |
| 325 | int KernelRows, int KernelCols, |
| 326 | int StrideRows, int StrideCols, |
| 327 | typename TIn, typename TOut> |
| 328 | struct DepthwiseConvolutionImpl : public DepthwiseConvolution< |
| 329 | OutputTileRows, OutputTileCols, |
| 330 | KernelRows, KernelCols, |
| 331 | StrideRows, StrideCols, TIn, TOut |
| 332 | > |
| 333 | { |
| 334 | typedef DepthwiseConvolution< |
| 335 | OutputTileRows, OutputTileCols, |
| 336 | KernelRows, KernelCols, |
| 337 | StrideRows, StrideCols, |
| 338 | TIn, TOut |
| 339 | > DWC; |
| 340 | |
| 341 | /** Perform the depthwise convolution of a tile. |
| 342 | * |
| 343 | * @param[in] n_channels Number of channels. |
| 344 | * @param[in] weights Pointer to Height x Width x Channels ordered weights. |
| 345 | * @param[in] inptr Pointer to the top-left unpadded value of the tile. |
| 346 | * @param[in] in_row_stride Stride between rows of the input tensor. |
| 347 | * @param[in] in_col_stride Stride between columns of the input tensor. |
| 348 | * @param[out] outptr Pointer to the top-left output value for the tile. |
| 349 | * @param[in] out_row_stride Stride between rows of the output tensor. |
| 350 | * @param[in] out_col_stride Stride between columns of the output tensor. |
| 351 | * |
| 352 | * The following parameters may be ignored if the function has been |
| 353 | * specialised for specific padding constraints. |
| 354 | * |
| 355 | * @param[in] _in_pad_top Padding to apply to top of input tile. |
| 356 | * @param[in] _in_pad_left Padding to apply to left of input tile. |
| 357 | * @param[in] _in_pad_bottom Padding to apply to bottom of input tile. |
| 358 | * @param[in] _in_pad_right Padding to apply to right of input tile. |
| 359 | * @param[in] _out_pad_bottom Null cells at bottom of output tile. |
| 360 | * @param[in] _out_pad_right Null cells at right of output tile. |
| 361 | */ |
| 362 | template < |
| 363 | bool Specialize=false, // Specialize (or not) the method |
| 364 | int InPadTop=0, // If specialized, top padding |
| 365 | int InPadLeft=0, // If specialized, left padding |
| 366 | int InPadBottom=0, // If specialized, bottom padding |
| 367 | int InPadRight=0, // If specialized, right padding |
| 368 | int OutPadBottom=0, // If specialized, bottom output padding |
| 369 | int OutPadRight=0 // If specialized, bottom right padding |
| 370 | > |
| 371 | static void process_tile( |
| 372 | const int n_channels, |
| 373 | const TIn* const weights, |
| 374 | const int weight_row_stride, |
| 375 | const int weight_col_stride, |
| 376 | const TIn* const inptr, |
| 377 | const int in_row_stride, |
| 378 | const int in_col_stride, |
| 379 | TOut* const outptr, |
| 380 | const int out_row_stride, |
| 381 | const int out_col_stride, |
| 382 | const int in_pad_top=0, |
| 383 | const int in_pad_left=0, |
| 384 | const int in_pad_bottom=0, |
| 385 | const int in_pad_right=0, |
| 386 | const int out_pad_bottom=0, |
Georgios Pinitas | a799ce0 | 2018-09-12 20:11:34 +0100 | [diff] [blame] | 387 | const int out_pad_right=0, |
| 388 | const int input_offset=0, |
| 389 | const int weights_offset=0 |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 390 | ); |
| 391 | }; |
| 392 | |
| 393 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 394 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 395 | template < |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 396 | bool Specialize, |
| 397 | int InPadTop, int InPadLeft, int InPadBottom, int InPadRight, |
| 398 | int OutPadBottom, int OutPadRight |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 399 | > |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 400 | void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile( |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 401 | const int n_channels, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 402 | const TIn *__restrict__ const weights, |
| 403 | const int weight_row_stride, |
| 404 | const int weight_col_stride, |
| 405 | const TIn *__restrict__ const inptr, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 406 | const int in_row_stride, |
| 407 | const int in_col_stride, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 408 | TOut *__restrict__ const outptr, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 409 | const int out_row_stride, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 410 | const int out_col_stride, |
| 411 | const int _in_pad_top, |
| 412 | const int _in_pad_left, |
| 413 | const int _in_pad_bottom, |
| 414 | const int _in_pad_right, |
| 415 | const int _out_pad_bottom, |
Georgios Pinitas | a799ce0 | 2018-09-12 20:11:34 +0100 | [diff] [blame] | 416 | const int _out_pad_right, |
| 417 | const int _input_offset, |
| 418 | const int _weights_offset |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 419 | ) |
| 420 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 421 | constexpr auto inner_tile_rows = DWC::inner_tile_rows; |
| 422 | constexpr auto inner_tile_cols = DWC::inner_tile_cols; |
| 423 | constexpr auto kernel_rows = DWC::kernel_rows; |
| 424 | constexpr auto kernel_cols = DWC::kernel_cols; |
| 425 | constexpr auto output_tile_rows = DWC::output_tile_rows; |
| 426 | constexpr auto output_tile_cols = DWC::output_tile_cols; |
| 427 | constexpr auto stride_rows = DWC::stride_rows; |
| 428 | constexpr auto stride_cols = DWC::stride_cols; |
| 429 | |
| 430 | // Extract parameters |
| 431 | const int in_pad_top = Specialize ? InPadTop : _in_pad_top; |
| 432 | const int in_pad_left = Specialize ? InPadLeft : _in_pad_left; |
| 433 | const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom; |
| 434 | const int in_pad_right = Specialize ? InPadRight : _in_pad_right; |
| 435 | const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom; |
| 436 | const int out_pad_right = Specialize ? OutPadRight : _out_pad_right; |
| 437 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 438 | // Compute valid ranges of the tile |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 439 | const int in_cells_i = inner_tile_rows - in_pad_bottom; |
| 440 | const int in_cells_j = inner_tile_cols - in_pad_right; |
| 441 | const int out_cells_i = output_tile_rows - out_pad_bottom; |
| 442 | const int out_cells_j = output_tile_cols - out_pad_right; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 443 | |
| 444 | // Instantiate pointers |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 445 | const TIn* __restrict__ inptr_base = inptr; |
| 446 | const TIn* __restrict__ wptr_base = weights; |
| 447 | TOut* __restrict__ outptr_base = outptr; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 448 | |
| 449 | // Perform the depthwise convolution |
| 450 | int channels_remaining = n_channels; |
| 451 | for (; channels_remaining; channels_remaining--) |
| 452 | { |
| 453 | // Load input tile |
| 454 | TIn u[inner_tile_rows][inner_tile_cols]; |
| 455 | for (int i = 0; i < inner_tile_rows; i++) |
| 456 | { |
| 457 | const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride; |
| 458 | for (int j = 0; j < inner_tile_cols; j++) |
| 459 | { |
| 460 | if (i < in_pad_top || in_cells_i <= i || |
| 461 | j < in_pad_left || in_cells_j <= j) |
| 462 | { |
| 463 | u[i][j] = static_cast<TIn>(0); |
| 464 | } |
| 465 | else |
| 466 | { |
| 467 | u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride); |
| 468 | } |
| 469 | } |
| 470 | } |
| 471 | inptr_base++; |
| 472 | |
| 473 | // Load weights tile |
| 474 | TIn w[kernel_rows][kernel_cols]; |
| 475 | for (int i = 0; i < kernel_rows; i++) |
| 476 | { |
| 477 | const TIn* const wptr_row = wptr_base + i*weight_row_stride; |
| 478 | for (int j = 0; j < kernel_cols; j++) |
| 479 | { |
| 480 | w[i][j] = *(wptr_row + j*weight_col_stride); |
| 481 | } |
| 482 | } |
| 483 | wptr_base++; |
| 484 | |
| 485 | // Perform the convolution |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 486 | TOut v[output_tile_rows][output_tile_cols]; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 487 | for (int out_i = 0; out_i < out_cells_i; out_i++) |
| 488 | { |
| 489 | for (int out_j = 0; out_j < out_cells_j; out_j++) |
| 490 | { |
| 491 | // Clear the accumulator |
| 492 | v[out_i][out_j] = static_cast<TOut>(0); |
| 493 | |
| 494 | // Base co-ordinate |
| 495 | const int base_i = out_i * stride_rows; |
| 496 | const int base_j = out_j * stride_cols; |
| 497 | |
| 498 | // Fill the accumulator |
| 499 | for (int in_i = 0; in_i < kernel_rows; in_i++) |
| 500 | { |
| 501 | const int i = base_i + in_i; |
| 502 | for (int in_j = 0; in_j < kernel_cols; in_j++) |
| 503 | { |
| 504 | const int j = base_j + in_j; |
| 505 | v[out_i][out_j] += w[in_i][in_j] * u[i][j]; |
| 506 | } |
| 507 | } |
| 508 | } |
| 509 | } |
| 510 | |
| 511 | // Store the output tile |
| 512 | for (int i = 0; i < out_cells_i; i++) |
| 513 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 514 | TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 515 | for (int j = 0; j < out_cells_j; j++) |
| 516 | { |
| 517 | *(outptr_row + j*out_col_stride) = v[i][j]; |
| 518 | } |
| 519 | } |
| 520 | outptr_base++; |
| 521 | } |
| 522 | } |
| 523 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 524 | } // namespace depthwise |