/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "pooling.hpp"
#include "src/core/NEON/kernels/arm_gemm/utils.hpp"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>

namespace arm_conv {
namespace pooling {

/* Interface through which a depthfirst pooling kernel advertises its tile
 * geometry: the size of the input patch it reads and of the output tile it
 * produces.
 */
class IDepthfirstStrategy
{
  public:
  virtual ~IDepthfirstStrategy() = default;

  virtual unsigned int get_input_rows() const = 0;
  virtual unsigned int get_input_cols() const = 0;

  virtual unsigned int get_output_rows() const = 0;
  virtual unsigned int get_output_cols() const = 0;
};
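
/* Illustration only (not part of the library): a hypothetical strategy for a
 * kernel which reads a 4x4 input patch and writes a 2x2 output tile would
 * simply report that geometry, e.g.
 *
 *   class ExampleStrategy : public IDepthfirstStrategy
 *   {
 *     public:
 *     unsigned int get_input_rows() const override { return 4; }
 *     unsigned int get_input_cols() const override { return 4; }
 *     unsigned int get_output_rows() const override { return 2; }
 *     unsigned int get_output_cols() const override { return 2; }
 *   };
 *
 * A concrete DepthfirstDriver subclass (defined below) then pairs such a
 * strategy with implementations of the per-tile compute routines.
 */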


/* Description of a tensor: a pointer to its start together with the strides
 * between consecutive rows and columns.
 */
template <typename T>
struct TensorSpec
{
  T base;
  size_t ld_row, ld_col;

  TensorSpec(T ptr, size_t ld_row, size_t ld_col)
  : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
};


template <typename TInput, typename TOutput>
class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
{
  protected:
  using Parent = PoolingCommon<TInput, TOutput>;

  // The strategy which we're applying to solve the pooling problem.
  std::unique_ptr<const IDepthfirstStrategy> m_strat;

  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread(unsigned int n_input_channels) const = 0;

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *, unsigned int n_input_channels) const = 0;

  /* Compute a portion of the output tensor with padding. */
  virtual void compute_tile_padded(
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const = 0;

  /* Compute a portion of the work with only top/bottom padding.
   *
   * The default implementation of this repeatedly calls into the padded tile
   * variant.
   */
  virtual void compute_row_padded_tile_row(
    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
    const unsigned int output_channel_start, const unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const
  {
    for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
    {
      this->compute_tile_padded(
        output_i, output_j, output_channel_start, output_channel_end,
        input, output, working_space
      );
    }
  }

  /* Compute a portion of the output tensor with no padding.
   *
   * The default implementation of this repeatedly calls into the row-padded
   * variant above.
   */
  virtual void compute_tiles_unpadded(
    unsigned int start_output_i, unsigned int start_output_j,
    unsigned int n_tile_rows, unsigned int n_tile_cols,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const
  {
    for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
    {
      this->compute_row_padded_tile_row(
        start_output_i, start_output_j, n_tile_cols,
        output_channel_start, output_channel_end,
        input, output, working_space
      );
      start_output_i += m_strat->get_output_rows();
    }
  }

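  /* Execute the pooling operation: decompose the output into tiles of
   * (get_output_rows() x get_output_cols()) points and dispatch each tile to
   * the padded or unpadded routine above as appropriate.
   */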
  void execute_internal(
    unsigned int n_batches,
    unsigned int input_height,
    unsigned int input_width,
    unsigned int n_channels,
    const PaddingValues &padding,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    unsigned int output_height,
    unsigned int output_width,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_channels);
    this->initialise_working_space(thread_working_space, n_channels);

    // Construct convenient representations of the input/output tensors.
    TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
    TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);

    // If the output is a 1x1 tensor, which commonly occurs at the end of a
    // network, then we change the threading strategy to parallelise over
    // channels rather than rows of the tensor.
    if (n_threads > 1 && output_height == 1 && output_width == 1)
    {
      // Determine how many channels should be assigned to each thread; we
      // round up (first to a multiple of 16, then to the thread count) to
      // ensure a reasonable spread across the threads. Consequently, later
      // threads may be left with little or no work.
      const auto channels_per_thread = arm_gemm::roundup(arm_gemm::roundup(n_channels, 16u), n_threads) / n_threads;
      const auto start_channel = thread_id * channels_per_thread;
      const auto end_channel = std::min(start_channel + channels_per_thread, n_channels);

      if (start_channel >= end_channel)
      {
        // This thread should move on if we have insufficient work to do.
        return;
      }

      for (; n_batches; n_batches--)
      {
        // We know we don't need to iterate over rows or columns here, so just
        // execute the tile.
        this->compute_tile_padded(
          0, 0,  // Compute the only output point
          start_channel, end_channel,
          input_tensor, output_tensor, thread_working_space
        );

        // Progress the pointers for the next batch.
        input_tensor.base += ld_input_batch;
        output_tensor.base += ld_output_batch;
      }

      // Exit here, since we've done all the work using this alternative strategy.
      return;
    }

    for (unsigned int batch = 0; batch < n_batches; batch++)
    {
      // Iterate over rows of the output tensor; we stripe over the tiles.
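      // Each thread starts at the tile row given by its thread_id and then
      // skips ahead by n_threads rows of tiles at a time.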
      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < output_height;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of tiles.
        const auto end_output_i = start_output_i + m_strat->get_output_rows();
        const bool pad_output_bottom = output_height < end_output_i;

        const int start_input_i = start_output_i * this->m_args.pool_stride.rows - padding.top;
        const bool pad_input_top = start_input_i < 0;
        const int end_input_i = start_input_i + m_strat->get_input_rows();
        const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
        const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;

        // Iterate over the columns of the output tensor; we attempt to grab
        // as much of the unpadded region as possible in each step, so the
        // loop structure is a little unusual.
        unsigned int start_output_j = 0;
        while (start_output_j < output_width)
        {
          const int start_in_j = start_output_j * this->m_args.pool_stride.cols - padding.left;
          const bool pad_input_left = start_in_j < 0;

          // Determine if we can process a number of unpadded tiles in one go.
          int n_unpadded_tiles = 0;
          if (!pad_input_left)
          {
            // Determine the maximum number of tiles we could handle.
            n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();

            // Handle padding on the right-hand edge.
            const int tile_stride = m_strat->get_output_cols() * this->m_args.pool_stride.cols;
            int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
            int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;

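            // Shrink the run of unpadded tiles until its final tile fits
            // within both the output and the input widths.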
            while (n_unpadded_tiles > 0 &&
                   (static_cast<int>(output_width) < end_output_j ||
                    static_cast<int>(input_width) < end_input_j))
            {
              n_unpadded_tiles--;
              end_output_j -= m_strat->get_output_cols();
              end_input_j -= tile_stride;
            }
          }

          // Process unpadded tiles if possible; otherwise process a single padded tile.
          if (n_unpadded_tiles)
          {
            if (!pad_row)
            {
              // Completely unpadded execution
              this->compute_tiles_unpadded(
                start_output_i, start_output_j,
                1, n_unpadded_tiles,  // Compute a row of unpadded tiles
                0, n_channels,  // Compute all channels
                input_tensor, output_tensor, thread_working_space
              );
            }
            else
            {
              // Top/bottom padding only
              this->compute_row_padded_tile_row(
                start_output_i, start_output_j, n_unpadded_tiles,
                0, n_channels,  // Compute all channels
                input_tensor, output_tensor, thread_working_space
              );
            }
            start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
          }
          else
          {
            this->compute_tile_padded(
              start_output_i, start_output_j,
              0, n_channels,  // Compute all channels
              input_tensor, output_tensor, thread_working_space
            );
            start_output_j += m_strat->get_output_cols();
          }
        }
      }

      // Progress the pointers for the next batch.
      input_tensor.base += ld_input_batch;
      output_tensor.base += ld_output_batch;
    }
  }

  public:
  // The driver takes ownership of the strategy passed to it.
  DepthfirstDriver(const IDepthfirstStrategy *strategy, const PoolingArgs &args)
  : Parent(args), m_strat(strategy)
  {
  }

  size_t get_working_size(unsigned int n_threads) const override
  {
    return this->get_working_size(n_threads, this->m_args.n_channels);
  }

  size_t get_working_size(unsigned int n_threads, unsigned int n_channels) const override final
  {
    return n_threads * this->get_working_size_per_thread(n_channels);
  }
};
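
/* Usage sketch (illustrative only): a concrete subclass supplies the pure
 * virtual tile routines above; callers then size the scratch space with
 * get_working_size() before dispatching work across threads, e.g.
 *
 *   // "ConcreteDriver" and "ConcreteStrategy" are hypothetical stand-ins.
 *   ConcreteDriver driver(new ConcreteStrategy(), args);
 *   std::vector<uint8_t> scratch(driver.get_working_size(n_threads));
 *   // Each thread then invokes the driver (via the PoolingCommon interface)
 *   // with its thread_id, the shared scratch buffer, and the tensor strides.
 */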

} // namespace pooling
} // namespace arm_conv