/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "depthwise.hpp"
#include "utils.hpp"

namespace arm_conv {
namespace depthwise {

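// Trait giving the default accumulator type for a given input/weight type;
// quantized (int8_t / uint8_t) data accumulates into int32_t.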
template <typename T> struct DefaultTAccum { using Type = T; };
template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };

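// Trait giving the default output stage for a given output type; quantized
// outputs are requantized via arm_gemm::Requantize32, all others need nothing.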
template <typename T> struct DefaultOutputStage { using Type = Nothing; };
template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };

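// Interface through which the driver queries the tile geometry (input and
// output rows/columns) of a concrete depthfirst strategy.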
class IDepthfirstStrategy
{
  public:
  virtual ~IDepthfirstStrategy() = default;

  virtual unsigned int get_input_rows() const = 0;
  virtual unsigned int get_input_cols() const = 0;

  virtual unsigned int get_output_rows() const = 0;
  virtual unsigned int get_output_cols() const = 0;
};


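// Lightweight view of a tensor: a base pointer together with its row and
// column strides.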
template <typename T>
struct TensorSpec
{
  T base;
  size_t ld_row, ld_col;

  TensorSpec(T ptr, size_t ld_row, size_t ld_col)
  : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
};


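// Driver which covers the output tensor tile by tile (depth-first),
// delegating the per-tile computation to a derived class through the virtual
// methods below.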
template <typename TInput, typename TWeight, typename TOutput>
class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
  protected:
  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;

  // The strategy which we're applying to solve the depthwise convolution.
  std::unique_ptr<const IDepthfirstStrategy> m_strat;

  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread() const = 0;

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *) const = 0;

  /* Compute a portion of the output tensor with padding. */
  virtual void compute_tile_padded(
    const DepthwiseArgs &args,
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const = 0;

  /* Compute a portion of the work with only top/bottom padding.
   *
   * The default implementation of this repeatedly calls into the padded tile
   * variant.
   */
  virtual void compute_row_padded_tile_row(
    const DepthwiseArgs &args,
    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
    const unsigned int output_channel_start, const unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const
  {
    for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
    {
      this->compute_tile_padded(
        args,
        output_i, output_j, output_channel_start, output_channel_end,
        input, output, parameters, working_space
      );
    }
  }

  /* Compute a portion of the output tensor with no padding.
   *
   * The default implementation of this repeatedly calls into the padded
   * variant.
   */
  virtual void compute_tiles_unpadded(
    const DepthwiseArgs &args,
    unsigned int start_output_i, unsigned int start_output_j,
    unsigned int n_tile_rows, unsigned int n_tile_cols,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const
  {
    for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
    {
      unsigned int row_start_output_j = start_output_j;
      for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
      {
        this->compute_tile_padded(
          args,
          start_output_i, row_start_output_j,
          output_channel_start, output_channel_end,
          input, output, parameters, working_space
        );
        row_start_output_j += m_strat->get_output_cols();
      }
      start_output_i += m_strat->get_output_rows();
    }
  }

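  // Implementation of the parent class's execute_internal hook: initialise
  // this thread's working space, then walk the output tensor batch by batch
  // and row by row, dispatching to the unpadded, row-padded, or fully padded
  // tile computations as appropriate.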
  void execute_internal(
    const DepthwiseArgs &args,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const void *parameters,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
    this->initialise_working_space(thread_working_space);

    // Construct convenient representations of the input/output tensors.
    TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
    TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);

    const auto n_output_channels = args.input_channels * args.channel_multiplier;

    for (unsigned int batch = 0; batch < args.n_batches; batch++)
    {
      // Iterate over rows of the output tensor; we stripe over the tiles.
      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < args.output_rows;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
        const auto end_output_i = start_output_i + m_strat->get_output_rows();
        const bool pad_output_bottom = args.output_rows < end_output_i;

        const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
        const bool pad_input_top = start_input_i < 0;
        const int end_input_i = start_input_i + m_strat->get_input_rows();
        const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
        // We only need to account for input padding if direct padding is not supported.
        const bool pad_row = ((pad_input_top || pad_input_bottom) && !this->supports_direct_padding())
                             || pad_output_bottom;

        // Iterate over the columns of the output tensor; we attempt to grab as
        // much as possible of the unpadded regions, so the loop structure is a
        // bit odd.
        unsigned int start_output_j = 0;
        while (start_output_j < args.output_cols)
        {
          const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
          const bool pad_input_left = start_in_j < 0;

          // Determine if we can process a number of unpadded tiles in one go.
          int n_unpadded_tiles = 0;
          if ((!pad_input_left) || this->supports_direct_padding())
          {
            // Determine the maximum number of tiles we could handle.
            n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();

            // Handle padding on the right hand edge
            const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
            int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
            int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;

            while (n_unpadded_tiles > 0 &&
                   (static_cast<int>(args.output_cols) < end_output_j ||
                    static_cast<int>(args.input_cols) < end_input_j))
            {
              n_unpadded_tiles--;
              end_output_j -= m_strat->get_output_cols();
              end_input_j -= tile_stride;
            }
          }

          // Process unpadded tiles, if possible, otherwise process a padded tile.
          if (n_unpadded_tiles)
          {
            if (!pad_row)
            {
              // Completely unpadded execution
              this->compute_tiles_unpadded(
                args,
                start_output_i, start_output_j,
                1, n_unpadded_tiles,  // Compute a row of unpadded tiles
                0, n_output_channels,  // Compute all channels
                input_tensor, output_tensor, parameters, thread_working_space
              );
            }
            else
            {
              // Top/bottom padding only
              this->compute_row_padded_tile_row(
                args,
                start_output_i, start_output_j, n_unpadded_tiles,
                0, n_output_channels,  // Compute all channels
                input_tensor, output_tensor, parameters, thread_working_space
              );
            }
            start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
          }
          else
          {
            this->compute_tile_padded(
              args,
              start_output_i, start_output_j,
              0, n_output_channels,  // Compute all channels
              input_tensor, output_tensor, parameters, thread_working_space
            );
            start_output_j += m_strat->get_output_cols();
          }
        }
      }

      // Progress the pointers for the next batch.
      input_tensor.base += ld_input_batch;
      output_tensor.base += ld_output_batch;
    }
  }

  public:
  DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
  : Parent(args), m_strat(strategy)
  {
  }

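  // Total working space is one per-thread buffer for each participating thread.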
  size_t get_working_size(unsigned int n_threads) const override final
  {
    return n_threads * this->get_working_size_per_thread();
  }

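  // Whether input padding can be handled directly by the "unpadded" code
  // paths; when this returns false (the default), tiles which touch input
  // padding are routed through the padded-tile implementations instead.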
  virtual bool supports_direct_padding() const
  {
    return false;
  }
};

}  // namespace depthwise
}  // namespace arm_conv