/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "pooling.hpp"
#include "src/core/NEON/kernels/arm_gemm/utils.hpp"

namespace arm_conv {
namespace pooling {

class IDepthfirstStrategy
{
  public:
  virtual ~IDepthfirstStrategy() = default;

  virtual unsigned int get_input_rows() const = 0;
  virtual unsigned int get_input_cols() const = 0;

  virtual unsigned int get_output_rows() const = 0;
  virtual unsigned int get_output_cols() const = 0;
};
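
// Illustrative sketch only: this file defines no concrete strategy. Assuming
// a 3x3 pooling window with unit stride, a minimal (hypothetical) strategy
// could report a 3x3 input block for every 1x1 output block:
//
//   class Pool3x3Strategy : public IDepthfirstStrategy  // hypothetical name
//   {
//     public:
//     unsigned int get_input_rows() const override { return 3; }
//     unsigned int get_input_cols() const override { return 3; }
//     unsigned int get_output_rows() const override { return 1; }
//     unsigned int get_output_cols() const override { return 1; }
//   };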


template <typename T>
struct TensorSpec
{
  T base;
  size_t ld_row, ld_col;

  TensorSpec(T ptr, size_t ld_row, size_t ld_col)
    : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
};
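
// Note (inferred from how the driver advances these pointers below): `base`
// points at the first element and `ld_row`/`ld_col` are strides measured in
// elements, so element (i, j) of a plane lives at `base + i*ld_row + j*ld_col`.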


template <typename TInput, typename TOutput>
class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
{
  protected:
  using Parent = PoolingCommon<TInput, TOutput>;

  // The strategy which we're applying to solve the pooling problem.
  std::unique_ptr<const IDepthfirstStrategy> m_strat;

  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread(unsigned int n_input_channels) const = 0;

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *, unsigned int n_input_channels) const = 0;

  /* Compute a portion of the output tensor with padding. */
  virtual void compute_tile_padded(
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const = 0;

  /* Compute a portion of the work with only top/bottom padding.
   *
   * The default implementation of this repeatedly calls into the padded tile
   * variant.
   */
  virtual void compute_row_padded_tile_row(
    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
    const unsigned int output_channel_start, const unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const
  {
    for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
    {
      this->compute_tile_padded(
        output_i, output_j, output_channel_start, output_channel_end,
        input, output, working_space
      );
    }
  }
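
  // Worked example (hypothetical numbers): with get_output_cols() == 4,
  // n_tile_cols == 3 and output_j == 8, the loop above computes padded tiles
  // whose first output columns are 8, 12 and 16.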

  /* Compute a portion of the output tensor with no padding.
   *
   * The default implementation of this repeatedly calls into the padded
   * variant.
   */
  virtual void compute_tiles_unpadded(
    unsigned int start_output_i, unsigned int start_output_j,
    unsigned int n_tile_rows, unsigned int n_tile_cols,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const
  {
    for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
    {
      this->compute_row_padded_tile_row(
        start_output_i, start_output_j, n_tile_cols,
        output_channel_start, output_channel_end,
        input, output, working_space
      );
      start_output_i += m_strat->get_output_rows();
    }
  }
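
  // Likewise illustrative: a call with n_tile_rows == 2 and
  // get_output_rows() == 3 computes two tile rows whose first output rows
  // are start_output_i and start_output_i + 3.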

  void execute_internal(
    unsigned int n_batches,
    unsigned int input_height,
    unsigned int input_width,
    unsigned int n_channels,
    const PaddingValues &padding,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    unsigned int output_height,
    unsigned int output_width,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_channels);
    this->initialise_working_space(thread_working_space, n_channels);

    // Construct convenient representations of the input/output tensors.
    TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
    TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);

    // If the output is a 1x1 tensor, which commonly occurs at the end of a
    // network, then we change the threading strategy to parallelise over
    // channels rather than rows of the tensor.
    if (n_threads > 1 && output_height == 1 && output_width == 1)
    {
      // Determine how many channels should be assigned to each thread; we
      // round up first to ensure we get a reasonable spread across the
      // threads.
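      // For example (hypothetical values): with n_channels == 20 and
      // n_threads == 4, roundup(20, 16) == 32 and roundup(32, 4) / 4 == 8,
      // so threads 0-2 cover channels [0, 8), [8, 16) and [16, 20), while
      // thread 3 starts at channel 24 >= 20 and returns immediately below.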
      const auto channels_per_thread = arm_gemm::roundup(arm_gemm::roundup(n_channels, 16u), n_threads) / n_threads;
      const auto start_channel = thread_id * channels_per_thread;
      const auto end_channel = std::min(start_channel + channels_per_thread, n_channels);

      if (start_channel >= end_channel)
      {
        // This thread should move on if we have insufficient work to do.
        return;
      }

      for (; n_batches; n_batches--)
      {
        // We know we don't need to iterate over rows or columns here, so just
        // execute the tile.
        this->compute_tile_padded(
          0, 0, // Compute the only output point
          start_channel, end_channel,
          input_tensor, output_tensor, thread_working_space
        );

        // Progress the pointers for the next batch.
        input_tensor.base += ld_input_batch;
        output_tensor.base += ld_output_batch;
      }

      // Exit here, since we've done all of the work using this alternative
      // strategy.
      return;
    }

    for (unsigned int batch = 0; batch < n_batches; batch++)
    {
      // Iterate over rows of the output tensor; we stripe the tile rows
      // across the threads.
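      // Illustrative striping (hypothetical numbers): with
      // get_output_rows() == 2 and n_threads == 3, thread 0 handles output
      // rows {0, 1}, {6, 7}, ..., thread 1 handles {2, 3}, {8, 9}, ..., and
      // thread 2 handles {4, 5}, {10, 11}, ....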
      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < output_height;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
        const auto end_output_i = start_output_i + m_strat->get_output_rows();
        const bool pad_output_bottom = output_height < end_output_i;

        const int start_input_i = start_output_i * this->m_args.pool_stride.rows - padding.top;
        const bool pad_input_top = start_input_i < 0;
        const int end_input_i = start_input_i + m_strat->get_input_rows();
        const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
        const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;

        // Iterate over the columns of the output tensor; we attempt to grab
        // as much of the unpadded region as possible, so each iteration of
        // the loop consumes either a run of unpadded tiles or a single
        // padded tile.
        unsigned int start_output_j = 0;
        while (start_output_j < output_width)
        {
          const int start_in_j = start_output_j * this->m_args.pool_stride.cols - padding.left;
          const bool pad_input_left = start_in_j < 0;

          // Determine if we can process a number of unpadded tiles in one go.
          int n_unpadded_tiles = 0;
          if (!pad_input_left)
          {
            // Determine the maximum number of tiles we could handle.
            n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();

            // Handle padding on the right-hand edge by shrinking the run of
            // tiles until it fits within both the output and input widths.
            const int tile_stride = m_strat->get_output_cols() * this->m_args.pool_stride.cols;
            int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
            int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1) * tile_stride;

            while (n_unpadded_tiles > 0 &&
                   (static_cast<int>(output_width) < end_output_j ||
                    static_cast<int>(input_width) < end_input_j))
            {
              n_unpadded_tiles--;
              end_output_j -= m_strat->get_output_cols();
              end_input_j -= tile_stride;
            }
          }
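
          // Worked example of the trimming above (hypothetical numbers):
          // with get_output_cols() == 3, get_input_cols() == 5, unit column
          // stride, output_width == 9, input_width == 9 and
          // start_output_j == 3 (so start_in_j == 2 given padding.left == 1),
          // we first get n_unpadded_tiles == 2, but
          // end_input_j == 2 + 5 + 3 == 10 > 9, so the loop trims the run
          // down to a single unpadded tile.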

          // Process unpadded tiles if possible; otherwise process a padded tile.
          if (n_unpadded_tiles)
          {
            if (!pad_row)
            {
              // Completely unpadded execution
              this->compute_tiles_unpadded(
                start_output_i, start_output_j,
                1, n_unpadded_tiles, // Compute a row of unpadded tiles
                0, n_channels, // Compute all channels
                input_tensor, output_tensor, thread_working_space
              );
            }
            else
            {
              // Top/bottom padding only
              this->compute_row_padded_tile_row(
                start_output_i, start_output_j, n_unpadded_tiles,
                0, n_channels, // Compute all channels
                input_tensor, output_tensor, thread_working_space
              );
            }
            start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
          }
          else
          {
            this->compute_tile_padded(
              start_output_i, start_output_j,
              0, n_channels, // Compute all channels
              input_tensor, output_tensor, thread_working_space
            );
            start_output_j += m_strat->get_output_cols();
          }
        }
      }

      // Progress the pointers for the next batch.
      input_tensor.base += ld_input_batch;
      output_tensor.base += ld_output_batch;
    }
  }

  public:
  // Takes ownership of the strategy passed in (held in a unique_ptr).
  DepthfirstDriver(const IDepthfirstStrategy *strategy, const PoolingArgs &args)
    : Parent(args), m_strat(strategy)
  {
  }

  size_t get_working_size(unsigned int n_threads) const override
  {
    return this->get_working_size(n_threads, this->m_args.n_channels);
  }

  size_t get_working_size(unsigned int n_threads, unsigned int n_channels) const override final
  {
    // Working space is allocated contiguously, one slice per thread.
    return n_threads * this->get_working_size_per_thread(n_channels);
  }
};
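
// Illustrative use only (assuming the hypothetical Pool3x3Strategy sketched
// above, and eliding the construction of PoolingArgs):
//
//   DepthfirstDriver<float, float> driver(new Pool3x3Strategy(), args);
//   std::vector<uint8_t> scratch(driver.get_working_size(n_threads));
//   // ...then invoke the public execution entry point inherited from
//   // PoolingCommon, passing each thread its thread_id and the shared
//   // scratch buffer.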

} // namespace pooling
} // namespace arm_conv