Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2018 ARM Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | |
| 25 | /* |
| 26 | * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! |
| 27 | * |
| 28 | * NOTE: Header to be included by implementation files only. |
| 29 | * |
| 30 | * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! |
| 31 | */ |
| 32 | |
| 33 | #include <algorithm> |
| 34 | #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" |
| 35 | #include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp" |
| 36 | |
| 37 | #pragma once |
| 38 | |
| 39 | namespace depthwise |
| 40 | { |
| 41 | |
// Number of channels grouped into one unit of the work window (see
// get_window() and run(), which both scale by this value).
constexpr unsigned int CHANNEL_BLOCK = 16;
| 43 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 44 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 45 | int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size( |
| 46 | const int dim_size, const bool same_padding |
| 47 | ) |
| 48 | { |
| 49 | return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR); |
| 50 | } |
| 51 | |
| 52 | |
| 53 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 54 | DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution( |
| 55 | const int n_batches, const int n_input_rows, const int n_input_cols, |
| 56 | const int n_channels, const bool padding_same, |
| 57 | const TIn* const weights, |
| 58 | const TIn* const input, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 59 | TOut* const output, |
| 60 | const int weight_col_stride, |
| 61 | const int weight_row_stride, |
| 62 | const int input_col_stride, |
| 63 | const int input_row_stride, |
| 64 | const int input_batch_stride, |
| 65 | const int output_col_stride, |
| 66 | const int output_row_stride, |
| 67 | const int output_batch_stride |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 68 | ) : _weights(weights), _input(input), _output(output), |
| 69 | _n_batches(n_batches), |
| 70 | _n_input_rows(n_input_rows), |
| 71 | _n_input_cols(n_input_cols), |
| 72 | _n_channels(n_channels), |
| 73 | _n_output_rows(get_output_size(n_input_rows, padding_same)), |
| 74 | _n_output_cols(get_output_size(n_input_cols, padding_same)), |
| 75 | _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)), |
| 76 | _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)), |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 77 | _padding_same(padding_same), |
| 78 | _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels), |
| 79 | _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride), |
| 80 | _input_col_stride(input_col_stride ? input_col_stride : _n_channels), |
| 81 | _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride), |
| 82 | _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride), |
| 83 | _output_col_stride(output_col_stride ? output_col_stride : _n_channels), |
| 84 | _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride), |
| 85 | _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride) |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 86 | { |
| 87 | } |
| 88 | |
| 89 | |
| 90 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 91 | unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const |
| 92 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 93 | // Parallelise over blocks of channels. |
| 94 | return iceildiv(_n_channels, CHANNEL_BLOCK); |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 95 | } |
| 96 | |
| 97 | |
| 98 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 99 | void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run( |
| 100 | const unsigned int start, |
| 101 | const unsigned int stop |
| 102 | ) |
| 103 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 104 | // Parallelise over blocks of channels |
| 105 | const auto start_channel = CHANNEL_BLOCK * start; |
| 106 | const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop); |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 107 | |
| 108 | // Compute top and bottom padding for input and output |
| 109 | const int input_pad_top = _padding_same ? |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 110 | ((_n_output_rows - 1)*stride_rows + kernel_rows - _n_input_rows) / 2 : 0; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 111 | const int input_pad_left = _padding_same ? |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 112 | ((_n_output_cols - 1)*stride_cols + kernel_cols - _n_input_cols) / 2 : 0; |
| 113 | constexpr int tile_overlap = kernel_rows - stride_rows; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 114 | |
| 115 | // Perform the convolution by calling `process_tile_row` for each tile row in |
| 116 | // each batch. |
| 117 | for (int batch = 0; batch < _n_batches; batch++) |
| 118 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 119 | const TIn* const inptr_batch = _input + batch*_input_batch_stride; |
| 120 | TOut* const outptr_batch = _output + batch*_output_batch_stride; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 121 | |
| 122 | // Loop over rows of tiles |
| 123 | for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++) |
| 124 | { |
| 125 | // Pointer to the row |
| 126 | const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top; |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 127 | const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride); |
| 128 | TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 129 | |
| 130 | // Input padding (top + bottom) for the row |
| 131 | const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top; |
| 132 | const int input_row_bottom = input_row_top + inner_tile_rows; |
| 133 | const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0; |
| 134 | const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows); |
| 135 | |
| 136 | // Output padding (bottom) for the row |
| 137 | const int output_row_bottom = (tile_i + 1)*output_tile_rows; |
| 138 | const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows); |
| 139 | |
| 140 | // Process the row |
| 141 | process_tile_row( |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 142 | stop_channel - start_channel, |
| 143 | _weights + start_channel, _weight_row_stride, _weight_col_stride, |
| 144 | inptr_row + start_channel, _input_row_stride, _input_col_stride, |
| 145 | outptr_row + start_channel, _output_row_stride, _output_col_stride, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 146 | input_row_pad_top, input_pad_left, input_row_pad_bottom, |
| 147 | output_row_pad_bottom, |
| 148 | _n_tile_cols, _n_input_cols, _n_output_cols |
| 149 | ); |
| 150 | } |
| 151 | } |
| 152 | } |
| 153 | |
| 154 | |
| 155 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 156 | void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row( |
| 157 | const int n_channels, |
| 158 | const TIn* const weights, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 159 | const int weight_row_stride, |
| 160 | const int weight_col_stride, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 161 | const TIn* const inptr, |
| 162 | const int in_row_stride, |
| 163 | const int in_col_stride, |
| 164 | TOut* const outptr, |
| 165 | const int out_row_stride, |
| 166 | const int out_col_stride, |
| 167 | const int row_pad_in_top, |
| 168 | const int row_pad_in_left, |
| 169 | const int row_pad_in_bottom, |
| 170 | const int row_pad_out_bottom, |
| 171 | const int n_tiles, |
| 172 | const int n_input_cols, |
| 173 | const int n_output_cols |
| 174 | ) |
| 175 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 176 | constexpr int tile_overlap = kernel_cols - stride_cols; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 177 | |
| 178 | // Loop over columns of tiles |
| 179 | for (int tile_j = 0; tile_j < n_tiles; tile_j++) |
| 180 | { |
| 181 | // Input padding (left + right) for the tile |
| 182 | const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0; |
| 183 | const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left; |
| 184 | const int t_in_end = t_in_start + inner_tile_cols; |
| 185 | const int t_pad_in_right = std::max(0, t_in_end - n_input_cols); |
| 186 | |
| 187 | // Output padding (right) for the tile |
| 188 | const int t_out_end = (tile_j + 1) * output_tile_cols; |
| 189 | const int t_pad_out_right = std::max(0, t_out_end - n_output_cols); |
| 190 | |
| 191 | // Get pointers into the inputs and outputs |
| 192 | const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left; |
| 193 | const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride); |
| 194 | TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride; |
| 195 | |
| 196 | // Apply the specific tile processing function |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 197 | const bool pad_top = row_pad_in_top > 0; |
| 198 | const bool pad_left = t_pad_in_left > 0; |
| 199 | const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom; |
| 200 | const bool pad_right = t_pad_in_right || t_pad_out_right; |
| 201 | |
| 202 | const TileFn tilefn = [&] () { |
| 203 | if (!pad_top && !pad_left && !pad_bottom && !pad_right) |
| 204 | { |
| 205 | // No padding |
| 206 | return tilefn_unpadded; |
| 207 | } |
| 208 | else if (pad_top && !pad_left && !pad_bottom && !pad_right) |
| 209 | { |
| 210 | // Padding on the top only, subtract off the minimum expected padding in |
| 211 | // order to index into the array of specialised methods. |
| 212 | const int index = row_pad_in_top - min_in_pad_top; |
| 213 | return tilefn_top[index]; |
| 214 | } |
| 215 | else if (!pad_top && pad_left && !pad_bottom && !pad_right) |
| 216 | { |
| 217 | // Padding on the left only, subtract off the minimum expected padding in |
| 218 | // order to index into the array of specialised methods. |
| 219 | const int index = t_pad_in_left - min_in_pad_left; |
| 220 | return tilefn_left[index]; |
| 221 | } |
| 222 | else if (!pad_top && !pad_left && pad_bottom && !pad_right) |
| 223 | { |
| 224 | // Padding on the bottom only |
| 225 | return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom]; |
| 226 | } |
| 227 | else if (!pad_top && !pad_left && !pad_bottom && pad_right) |
| 228 | { |
| 229 | // Padding on the right only |
| 230 | return tilefn_right[t_pad_in_right][t_pad_out_right]; |
| 231 | } |
| 232 | else |
| 233 | { |
| 234 | // Otherwise use generic tile processing method. |
| 235 | return tilefn_generic; |
| 236 | } |
| 237 | }(); |
| 238 | |
| 239 | tilefn( |
| 240 | n_channels, |
| 241 | weights, weight_row_stride, weight_col_stride, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 242 | inptr_col, in_row_stride, in_col_stride, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 243 | outptr_col, out_row_stride, out_col_stride, |
| 244 | row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, |
| 245 | row_pad_out_bottom, t_pad_out_right |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 246 | ); |
| 247 | } |
| 248 | } |
| 249 | |
| 250 | |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 251 | // New templated struct used solely as a way to provide tile processing |
| 252 | // specialisations. |
| 253 | template <int OutputTileRows, int OutputTileCols, |
| 254 | int KernelRows, int KernelCols, |
| 255 | int StrideRows, int StrideCols, |
| 256 | typename TIn, typename TOut> |
| 257 | struct DepthwiseConvolutionImpl : public DepthwiseConvolution< |
| 258 | OutputTileRows, OutputTileCols, |
| 259 | KernelRows, KernelCols, |
| 260 | StrideRows, StrideCols, TIn, TOut |
| 261 | > |
| 262 | { |
| 263 | typedef DepthwiseConvolution< |
| 264 | OutputTileRows, OutputTileCols, |
| 265 | KernelRows, KernelCols, |
| 266 | StrideRows, StrideCols, |
| 267 | TIn, TOut |
| 268 | > DWC; |
| 269 | |
| 270 | /** Perform the depthwise convolution of a tile. |
| 271 | * |
| 272 | * @param[in] n_channels Number of channels. |
| 273 | * @param[in] weights Pointer to Height x Width x Channels ordered weights. |
| 274 | * @param[in] inptr Pointer to the top-left unpadded value of the tile. |
| 275 | * @param[in] in_row_stride Stride between rows of the input tensor. |
| 276 | * @param[in] in_col_stride Stride between columns of the input tensor. |
| 277 | * @param[out] outptr Pointer to the top-left output value for the tile. |
| 278 | * @param[in] out_row_stride Stride between rows of the output tensor. |
| 279 | * @param[in] out_col_stride Stride between columns of the output tensor. |
| 280 | * |
| 281 | * The following parameters may be ignored if the function has been |
| 282 | * specialised for specific padding constraints. |
| 283 | * |
| 284 | * @param[in] _in_pad_top Padding to apply to top of input tile. |
| 285 | * @param[in] _in_pad_left Padding to apply to left of input tile. |
| 286 | * @param[in] _in_pad_bottom Padding to apply to bottom of input tile. |
| 287 | * @param[in] _in_pad_right Padding to apply to right of input tile. |
| 288 | * @param[in] _out_pad_bottom Null cells at bottom of output tile. |
| 289 | * @param[in] _out_pad_right Null cells at right of output tile. |
| 290 | */ |
| 291 | template < |
| 292 | bool Specialize=false, // Specialize (or not) the method |
| 293 | int InPadTop=0, // If specialized, top padding |
| 294 | int InPadLeft=0, // If specialized, left padding |
| 295 | int InPadBottom=0, // If specialized, bottom padding |
| 296 | int InPadRight=0, // If specialized, right padding |
| 297 | int OutPadBottom=0, // If specialized, bottom output padding |
| 298 | int OutPadRight=0 // If specialized, bottom right padding |
| 299 | > |
| 300 | static void process_tile( |
| 301 | const int n_channels, |
| 302 | const TIn* const weights, |
| 303 | const int weight_row_stride, |
| 304 | const int weight_col_stride, |
| 305 | const TIn* const inptr, |
| 306 | const int in_row_stride, |
| 307 | const int in_col_stride, |
| 308 | TOut* const outptr, |
| 309 | const int out_row_stride, |
| 310 | const int out_col_stride, |
| 311 | const int in_pad_top=0, |
| 312 | const int in_pad_left=0, |
| 313 | const int in_pad_bottom=0, |
| 314 | const int in_pad_right=0, |
| 315 | const int out_pad_bottom=0, |
| 316 | const int out_pad_right=0 |
| 317 | ); |
| 318 | }; |
| 319 | |
| 320 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 321 | template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> |
| 322 | template < |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 323 | bool Specialize, |
| 324 | int InPadTop, int InPadLeft, int InPadBottom, int InPadRight, |
| 325 | int OutPadBottom, int OutPadRight |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 326 | > |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 327 | void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile( |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 328 | const int n_channels, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 329 | const TIn *__restrict__ const weights, |
| 330 | const int weight_row_stride, |
| 331 | const int weight_col_stride, |
| 332 | const TIn *__restrict__ const inptr, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 333 | const int in_row_stride, |
| 334 | const int in_col_stride, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 335 | TOut *__restrict__ const outptr, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 336 | const int out_row_stride, |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 337 | const int out_col_stride, |
| 338 | const int _in_pad_top, |
| 339 | const int _in_pad_left, |
| 340 | const int _in_pad_bottom, |
| 341 | const int _in_pad_right, |
| 342 | const int _out_pad_bottom, |
| 343 | const int _out_pad_right |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 344 | ) |
| 345 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 346 | constexpr auto inner_tile_rows = DWC::inner_tile_rows; |
| 347 | constexpr auto inner_tile_cols = DWC::inner_tile_cols; |
| 348 | constexpr auto kernel_rows = DWC::kernel_rows; |
| 349 | constexpr auto kernel_cols = DWC::kernel_cols; |
| 350 | constexpr auto output_tile_rows = DWC::output_tile_rows; |
| 351 | constexpr auto output_tile_cols = DWC::output_tile_cols; |
| 352 | constexpr auto stride_rows = DWC::stride_rows; |
| 353 | constexpr auto stride_cols = DWC::stride_cols; |
| 354 | |
| 355 | // Extract parameters |
| 356 | const int in_pad_top = Specialize ? InPadTop : _in_pad_top; |
| 357 | const int in_pad_left = Specialize ? InPadLeft : _in_pad_left; |
| 358 | const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom; |
| 359 | const int in_pad_right = Specialize ? InPadRight : _in_pad_right; |
| 360 | const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom; |
| 361 | const int out_pad_right = Specialize ? OutPadRight : _out_pad_right; |
| 362 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 363 | // Compute valid ranges of the tile |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 364 | const int in_cells_i = inner_tile_rows - in_pad_bottom; |
| 365 | const int in_cells_j = inner_tile_cols - in_pad_right; |
| 366 | const int out_cells_i = output_tile_rows - out_pad_bottom; |
| 367 | const int out_cells_j = output_tile_cols - out_pad_right; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 368 | |
| 369 | // Instantiate pointers |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 370 | const TIn* __restrict__ inptr_base = inptr; |
| 371 | const TIn* __restrict__ wptr_base = weights; |
| 372 | TOut* __restrict__ outptr_base = outptr; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 373 | |
| 374 | // Perform the depthwise convolution |
| 375 | int channels_remaining = n_channels; |
| 376 | for (; channels_remaining; channels_remaining--) |
| 377 | { |
| 378 | // Load input tile |
| 379 | TIn u[inner_tile_rows][inner_tile_cols]; |
| 380 | for (int i = 0; i < inner_tile_rows; i++) |
| 381 | { |
| 382 | const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride; |
| 383 | for (int j = 0; j < inner_tile_cols; j++) |
| 384 | { |
| 385 | if (i < in_pad_top || in_cells_i <= i || |
| 386 | j < in_pad_left || in_cells_j <= j) |
| 387 | { |
| 388 | u[i][j] = static_cast<TIn>(0); |
| 389 | } |
| 390 | else |
| 391 | { |
| 392 | u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride); |
| 393 | } |
| 394 | } |
| 395 | } |
| 396 | inptr_base++; |
| 397 | |
| 398 | // Load weights tile |
| 399 | TIn w[kernel_rows][kernel_cols]; |
| 400 | for (int i = 0; i < kernel_rows; i++) |
| 401 | { |
| 402 | const TIn* const wptr_row = wptr_base + i*weight_row_stride; |
| 403 | for (int j = 0; j < kernel_cols; j++) |
| 404 | { |
| 405 | w[i][j] = *(wptr_row + j*weight_col_stride); |
| 406 | } |
| 407 | } |
| 408 | wptr_base++; |
| 409 | |
| 410 | // Perform the convolution |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 411 | TOut v[output_tile_rows][output_tile_cols]; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 412 | for (int out_i = 0; out_i < out_cells_i; out_i++) |
| 413 | { |
| 414 | for (int out_j = 0; out_j < out_cells_j; out_j++) |
| 415 | { |
| 416 | // Clear the accumulator |
| 417 | v[out_i][out_j] = static_cast<TOut>(0); |
| 418 | |
| 419 | // Base co-ordinate |
| 420 | const int base_i = out_i * stride_rows; |
| 421 | const int base_j = out_j * stride_cols; |
| 422 | |
| 423 | // Fill the accumulator |
| 424 | for (int in_i = 0; in_i < kernel_rows; in_i++) |
| 425 | { |
| 426 | const int i = base_i + in_i; |
| 427 | for (int in_j = 0; in_j < kernel_cols; in_j++) |
| 428 | { |
| 429 | const int j = base_j + in_j; |
| 430 | v[out_i][out_j] += w[in_i][in_j] * u[i][j]; |
| 431 | } |
| 432 | } |
| 433 | } |
| 434 | } |
| 435 | |
| 436 | // Store the output tile |
| 437 | for (int i = 0; i < out_cells_i; i++) |
| 438 | { |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 439 | TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 440 | for (int j = 0; j < out_cells_j; j++) |
| 441 | { |
| 442 | *(outptr_row + j*out_col_stride) = v[i][j]; |
| 443 | } |
| 444 | } |
| 445 | outptr_base++; |
| 446 | } |
| 447 | } |
| 448 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 449 | } // namespace depthwise |