Michele Di Giorgio | d556d7b | 2020-10-27 10:56:31 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2021 Arm Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | #pragma once |
| 25 | |
| 26 | #include "pool_common.hpp" |
| 27 | |
| 28 | #include <stack> |
| 29 | #include <vector> |
| 30 | |
| 31 | namespace arm_conv { |
| 32 | namespace pooling { |
| 33 | |
| 34 | template <class strategy> |
| 35 | class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type> |
| 36 | { |
| 37 | using TInput = typename strategy::operand_type; |
| 38 | using TOutput = typename strategy::return_type; |
| 39 | |
| 40 | const PoolingArgs m_args; // Copy of arguments |
| 41 | |
| 42 | constexpr static unsigned int input_rows(void) |
| 43 | { |
| 44 | return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows(); |
| 45 | } |
| 46 | |
| 47 | constexpr static unsigned int input_cols(void) |
| 48 | { |
| 49 | return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols(); |
| 50 | } |
| 51 | |
| 52 | size_t sizeof_input_buffer(void) const |
| 53 | { |
| 54 | return sizeof(TInput) * m_args.n_channels; |
| 55 | } |
| 56 | |
| 57 | size_t sizeof_output_buffer(void) const |
| 58 | { |
| 59 | return sizeof(TOutput) * m_args.n_channels; |
| 60 | } |
| 61 | |
| 62 | public: |
| 63 | PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args) |
| 64 | { |
| 65 | } |
| 66 | |
| 67 | PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete; |
| 68 | PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete; |
| 69 | |
| 70 | size_t get_working_size(void) const override |
| 71 | { |
| 72 | // We require an array of pointers for the inputs and outputs, a |
| 73 | // channel-length vector in which to dump surplus output, and a |
| 74 | // channel-length vector of padding values. |
| 75 | return sizeof_input_buffer() + sizeof_output_buffer(); |
| 76 | } |
| 77 | |
| 78 | void execute( |
| 79 | const void *const input, |
| 80 | void *const output, |
| 81 | void *const working_space |
| 82 | ) const override |
| 83 | { |
| 84 | const size_t ld_input_col = m_args.n_channels; |
| 85 | const size_t ld_input_row = ld_input_col * m_args.input_cols; |
| 86 | const size_t ld_input_batch = ld_input_row * m_args.input_rows; |
| 87 | const size_t ld_output_col = ld_input_col; |
| 88 | const size_t ld_output_row = ld_output_col * m_args.output_cols; |
| 89 | const size_t ld_output_batch = ld_output_row * m_args.output_rows; |
| 90 | |
| 91 | execute( |
| 92 | input, ld_input_col, ld_input_row, ld_input_batch, |
| 93 | output, ld_output_col, ld_output_row, ld_output_batch, |
| 94 | working_space |
| 95 | ); |
| 96 | } |
| 97 | |
| 98 | void execute( |
| 99 | const void *const input, |
| 100 | size_t ld_input_col, |
| 101 | size_t ld_input_row, |
| 102 | size_t ld_input_batch, |
| 103 | void *const output, |
| 104 | size_t ld_output_col, |
| 105 | size_t ld_output_row, |
| 106 | size_t ld_output_batch, |
| 107 | void *const working_space |
| 108 | ) const override |
| 109 | { |
| 110 | execute( |
| 111 | m_args.n_batches, m_args.input_rows, m_args.input_cols, |
| 112 | m_args.n_channels, |
| 113 | input, ld_input_col, ld_input_row, ld_input_batch, |
| 114 | m_args.padding, |
| 115 | m_args.output_rows, m_args.output_cols, |
| 116 | output, ld_output_col, ld_output_row, ld_output_batch, |
| 117 | working_space |
| 118 | ); |
| 119 | } |
| 120 | |
| 121 | void execute( |
| 122 | unsigned int batches, |
| 123 | unsigned int input_height, |
| 124 | unsigned int input_width, |
| 125 | unsigned int channels, |
| 126 | const void *const _input, |
| 127 | size_t ld_input_col, |
| 128 | size_t ld_input_row, |
| 129 | size_t ld_input_batch, |
| 130 | const PaddingValues &padding, |
| 131 | unsigned int output_height, |
| 132 | unsigned int output_width, |
| 133 | void *const _output, |
| 134 | size_t ld_output_col, |
| 135 | size_t ld_output_row, |
| 136 | size_t ld_output_batch, |
| 137 | void *const _working_space |
| 138 | ) const override |
| 139 | { |
| 140 | strategy strat(m_args.cpu_info); |
| 141 | #ifdef CYCLE_PROFILING |
| 142 | arm_gemm::profiler prof; |
| 143 | #endif // CYCLE_PROFILING |
| 144 | |
| 145 | // Cast input and output pointers into the right types |
| 146 | const TInput *const inptr = static_cast<const TInput *>(_input); |
| 147 | TOutput *const outptr = static_cast<TOutput *>(_output); |
| 148 | |
| 149 | // Allocate portions of the working space |
| 150 | uint8_t *const working_space = static_cast<uint8_t *>(_working_space); |
| 151 | TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space); |
| 152 | TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer()); |
| 153 | |
| 154 | // Fill the input buffer |
| 155 | const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE) |
| 156 | ? static_cast<TInput>(0) |
| 157 | : (std::numeric_limits<TInput>::has_infinity |
| 158 | ? -std::numeric_limits<TInput>::infinity() |
| 159 | : std::numeric_limits<TInput>::lowest()); |
| 160 | for (unsigned int i = 0; i < channels; i++) |
| 161 | { |
| 162 | input_buffer[i] = pad_value; |
| 163 | } |
| 164 | |
| 165 | // Keep subdividing the output plane across the longest dimension until we |
| 166 | // reach the size of the tile. Queue items for later processing. Note - we |
| 167 | // can determine the largest size of the queue a priori from the input |
| 168 | // tensor size, this would allow us to allocate memory within the working |
| 169 | // space and improve performance. |
| 170 | struct WorkItem |
| 171 | { |
| 172 | unsigned int output_i, output_j; |
| 173 | unsigned int output_height, output_width; |
| 174 | |
| 175 | WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width) |
| 176 | : output_i(i), output_j(j), output_height(height), output_width(width) {} |
| 177 | }; |
| 178 | |
| 179 | auto execute = [&] (const WorkItem &item) { |
| 180 | // Create an array for the output pointers |
| 181 | TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()]; |
| 182 | TOutput **const outptr_array = _outptr_array; |
| 183 | |
| 184 | // Construct the output pointer array |
| 185 | { |
| 186 | const auto output_pad_right = strategy::out_rows() - item.output_width; |
| 187 | auto outptr_element = outptr_array; |
| 188 | auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col; |
| 189 | |
| 190 | // Fill the array with pointers to the output buffer |
| 191 | for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++) |
| 192 | { |
| 193 | outptr_array[i] = output_buffer; |
| 194 | } |
| 195 | |
| 196 | // Fill in the valid portion of the array |
| 197 | for (unsigned int i = 0; i < item.output_height; i++) |
| 198 | { |
| 199 | auto outptr_col = outptr_row; |
| 200 | for (unsigned int j = 0; j < item.output_width; j++) |
| 201 | { |
| 202 | *(outptr_element++) = outptr_col; |
| 203 | outptr_col += ld_output_col; |
| 204 | } |
| 205 | outptr_element += output_pad_right; |
| 206 | outptr_row += ld_output_row; |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | const int start_i = item.output_i * strategy::stride_rows() - padding.top; |
| 211 | const int end_i = start_i + input_rows(); |
| 212 | const unsigned int pad_top = std::max(0, 0 - start_i); |
| 213 | const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height)); |
| 214 | |
| 215 | const int start_j = item.output_j * strategy::stride_cols() - padding.left; |
| 216 | const int end_j = start_j + input_cols(); |
| 217 | const unsigned int pad_left = std::max(0, 0 - start_j); |
| 218 | const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width)); |
| 219 | |
| 220 | // Create an array for the input pointers |
| 221 | const TInput * _inptr_array[input_rows() * input_cols()]; |
| 222 | const TInput **const inptr_array = _inptr_array; |
| 223 | { |
| 224 | const unsigned int row_padding = pad_top + pad_bottom; |
| 225 | const unsigned int valid_rows = input_rows() - row_padding; |
| 226 | |
| 227 | const unsigned int col_padding = pad_left + pad_right; |
| 228 | const unsigned int valid_cols = input_cols() - col_padding; |
| 229 | |
| 230 | // Fill the array with pointers to the input buffer |
| 231 | for (unsigned int i = 0; i < input_rows() * input_cols(); i++) |
| 232 | { |
| 233 | inptr_array[i] = input_buffer; |
| 234 | } |
| 235 | |
| 236 | // Compute valid initial pointer |
| 237 | auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col; |
| 238 | |
| 239 | // Fill in the valid portion of the input array |
| 240 | auto inptr_element = inptr_array + pad_top * input_cols() + pad_left; |
| 241 | for (unsigned int i = 0; i < valid_rows; i++) |
| 242 | { |
| 243 | auto inptr_col = inptr_row; |
| 244 | for (unsigned int j = 0; j < valid_cols; j++) |
| 245 | { |
| 246 | *(inptr_element++) = inptr_col; |
| 247 | inptr_col += ld_input_col; |
| 248 | } |
| 249 | |
| 250 | inptr_row += ld_input_row; |
| 251 | inptr_element += col_padding; // Skip the padding elements |
| 252 | } |
| 253 | } |
| 254 | |
| 255 | // Call the kernel |
| 256 | #ifdef CYCLE_PROFILING |
| 257 | // TODO Work number |
| 258 | auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols())); |
| 259 | #endif // CYCLE_PROFILING |
| 260 | strat.kernel(channels, inptr_array, outptr_array, |
| 261 | pad_left, pad_top, pad_right, pad_bottom); |
| 262 | }; |
| 263 | |
| 264 | // Add the initial work item to the stack of work. |
| 265 | std::stack<WorkItem, std::vector<WorkItem>> stack; |
| 266 | stack.push(WorkItem(0, 0, output_height, output_width)); |
| 267 | while (!stack.empty()) |
| 268 | { |
| 269 | // Pop an item from the stack, bisect the largest dimension and either |
| 270 | // execute the resulting tiles or add them to the stack if they are too |
| 271 | // large. |
| 272 | const WorkItem item(stack.top()); |
| 273 | stack.pop(); |
| 274 | |
| 275 | if (item.output_height <= strategy::out_rows() && |
| 276 | item.output_width <= strategy::out_cols()) |
| 277 | { |
| 278 | execute(item); |
| 279 | } |
| 280 | else |
| 281 | { |
| 282 | // Split the largest dimension, such that we get an exact number of |
| 283 | // tiles in the first partition. |
| 284 | if (item.output_height >= item.output_width) |
| 285 | { |
| 286 | const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows(); |
| 287 | const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2; |
| 288 | |
| 289 | const unsigned int height_first = tiles_first * strategy::out_rows(); |
| 290 | const unsigned int height_second = item.output_height - height_first; |
| 291 | |
| 292 | stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width)); |
| 293 | stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width)); |
| 294 | } |
| 295 | else |
| 296 | { |
| 297 | const unsigned int width_in_tiles = item.output_width / strategy::out_cols(); |
| 298 | const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2; |
| 299 | |
| 300 | const unsigned int width_first = tiles_first * strategy::out_cols(); |
| 301 | const unsigned int width_second = item.output_width - width_first; |
| 302 | |
| 303 | stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second)); |
| 304 | stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first)); |
| 305 | } |
| 306 | } |
| 307 | } |
| 308 | } |
| 309 | }; |
| 310 | |
| 311 | } // namespace pooling |
| 312 | } // namespace arm_conv |