blob: 07c582059f8a2a609348de2b0b28a96b3edce558 [file] [log] [blame]
/*
 * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25#pragma once
26
ramelg01c827e992022-04-08 03:52:28 +010027#include "depthfirst_driver.hpp"
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000028#include "utils.hpp"
Pablo Marquez Tello84a09412022-06-30 17:00:09 +010029#if !defined(_WIN64) && !defined(__OpenBSD__)
ramelg01c827e992022-04-08 03:52:28 +010030#include <alloca.h>
Pablo Marquez Tello84a09412022-06-30 17:00:09 +010031#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000032
33namespace arm_conv {
34namespace pooling {
35
ramelg01c827e992022-04-08 03:52:28 +010036template <typename TInput, typename TOutput, typename OutputStage = Nothing>
37class IGenericDepthfirstStrategy;
38
39template <typename TInput, typename TOutput>
40class IGenericDepthfirstStrategy<TInput, TOutput, Nothing>
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000041{
ramelg01c827e992022-04-08 03:52:28 +010042 public:
43 virtual ~IGenericDepthfirstStrategy() = default;
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000044
ramelg01c827e992022-04-08 03:52:28 +010045 typedef void (*KernelType)(
46 uint64_t window_cells,
47 uint64_t n_valid_cells,
48 uint64_t n_channels,
49 const TInput *const *,
50 TOutput *
51 );
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000052
ramelg01c827e992022-04-08 03:52:28 +010053 virtual KernelType get_kernel(void) const = 0;
54};
55
56template <typename TInput, typename TOutput>
57class IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>
58{
59 public:
60 virtual ~IGenericDepthfirstStrategy() = default;
61
62 typedef void (*KernelType)(
63 uint64_t window_cells,
64 uint64_t n_valid_cells,
65 uint64_t n_channels,
66 const TInput *const *,
67 TOutput *,
68 const Requantize32 &
69 );
70
71 virtual KernelType get_kernel(void) const = 0;
72};
73
74template <typename TInput, typename TOutput, typename OutputStage>
75struct Invoker;
76
77template <typename TInput, typename TOutput>
78struct Invoker<TInput, TOutput, Nothing>
79{
80 static inline void invoke(
81 const typename IGenericDepthfirstStrategy<TInput, TOutput, Nothing>::KernelType kern,
82 uint64_t window_cells,
83 uint64_t n_valid_cells,
84 uint64_t n_channels,
85 const TInput *const *inptrs,
86 TOutput *outptr,
87 const Nothing &
88 )
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000089 {
ramelg01c827e992022-04-08 03:52:28 +010090 kern(window_cells, n_valid_cells, n_channels, inptrs, outptr);
91 }
92};
93
94template <typename TInput, typename TOutput>
95struct Invoker<TInput, TOutput, Requantize32>
96{
97 static inline void invoke(
98 const typename IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>::KernelType kern,
99 uint64_t window_cells,
100 uint64_t n_valid_cells,
101 uint64_t n_channels,
102 const TInput *const *inptrs,
103 TOutput *outptr,
104 const Requantize32 &qp
105 )
106 {
107 kern(window_cells, n_valid_cells, n_channels, inptrs, outptr, qp);
108 }
109};
110
111template <typename TInput, typename TOutput, typename OutputStage>
112class GenericDepthfirstWrapper : public IDepthfirstStrategy
113{
114 using StratType = IGenericDepthfirstStrategy<TInput, TOutput, OutputStage>;
115
116 std::unique_ptr<const StratType> m_strat;
117 const unsigned int window_rows, window_cols;
118
119 public:
120 GenericDepthfirstWrapper(const StratType *strat, const PoolingArgs &args)
121 : m_strat(strat), window_rows(args.pool_window.rows), window_cols(args.pool_window.cols)
122 {
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +0000123 }
124
ramelg01c827e992022-04-08 03:52:28 +0100125 unsigned int get_input_rows(void) const override { return window_rows; }
126 unsigned int get_input_cols(void) const override { return window_cols; }
127 unsigned int get_output_rows(void) const override { return 1; }
128 unsigned int get_output_cols(void) const override { return 1; }
129
130 typename StratType::KernelType get_kernel(void) const { return m_strat->get_kernel(); }
131};
132
133template <typename TInput, typename TOutput=TInput, typename OutputStage=Nothing>
134class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
135{
136 const OutputStage m_os;
137
138 protected:
139 size_t get_working_size_per_thread(unsigned int) const override { return 0; }
140 void initialise_working_space(void *, unsigned int) const override { /* Nothing */ }
141
142 /* Compute a portion of the output tensor with padding. */
143 void compute_tile_padded(
144 unsigned int output_i, unsigned int output_j,
145 unsigned int channel_start, unsigned int channel_end,
146 const TensorSpec<const TInput *> &input,
147 const TensorSpec<TOutput *> &output,
148 void *
149 ) const override
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +0000150 {
ramelg01c827e992022-04-08 03:52:28 +0100151 // Determine start position and padding
152 const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
153 const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
154 const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
155 const int end_i = start_i + this->m_args.pool_window.rows;
156 const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
157 const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
158
159 const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
160 const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
161 const auto pad_left = static_cast<unsigned int>(start_j < 0 ? -start_j : 0);
162 const int end_j = start_j + this->m_args.pool_window.cols;
163 const auto pad_right = static_cast<unsigned int>((unsigned int) end_j < this->m_args.input_cols ? 0 : end_j - this->m_args.input_cols);
164 const auto valid_cols = this->m_args.pool_window.cols - (pad_left + pad_right);
165
166 // Determine the number of valid cells and prepare the pointers
167 const auto n_valid_cells = valid_rows * valid_cols;
168 auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
169 {
170 auto my_ptr = inptrs;
171 auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
172 for (auto i = valid_rows; i; i--)
173 {
174 auto ptr = row_ptr;
175 row_ptr += input.ld_row;
176
177 for (auto j = valid_cols; j; j--)
178 {
179 *(my_ptr++) = ptr;
180 ptr += input.ld_col;
181 }
182 }
183 }
184
185 auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;
186
187 // Some padding variants include (or exclude) the padding values; we handle
188 // this by computing the extent of the padded input tensor and hence
189 // computing the total number of cells captured in the pooling window.
190 const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
191 const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
192 const auto right_padded_width = this->m_args.input_cols + this->m_args.padding.right;
193 const auto captured_cols = std::min<int>(end_j, right_padded_width) - start_j;
194 const auto captured_cells = captured_rows * captured_cols;
195 const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
196
197 // Execute the kernel
198 Invoker<TInput, TOutput, OutputStage>::invoke(
199 reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
200 window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
201 );
202 }
203
204 // Compute a portion of the work with only top/bottom padding.
205 void compute_row_padded_tile_row(
206 const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
207 const unsigned int channel_start, const unsigned int channel_end,
208 const TensorSpec<const TInput *> &input,
209 const TensorSpec<TOutput *> &output,
Michael Tyler74921ee2023-04-12 17:43:17 +0100210 void *
ramelg01c827e992022-04-08 03:52:28 +0100211 ) const override
212 {
ramelg01c827e992022-04-08 03:52:28 +0100213 // Determine start position and padding
214 const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
215 const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
216 const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
217 const int end_i = start_i + this->m_args.pool_window.rows;
218 const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
219 const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
220
221 const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
222 const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
223 const auto valid_cols = this->m_args.pool_window.cols;
224
225 // Determine the number of valid cells and prepare the pointers
226 const auto n_valid_cells = valid_rows * valid_cols;
227 auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
228 {
229 auto my_ptr = inptrs;
230 auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
231 for (auto i = valid_rows; i; i--)
232 {
233 auto ptr = row_ptr;
234 row_ptr += input.ld_row;
235
236 for (auto j = valid_cols; j; j--)
237 {
238 *(my_ptr++) = ptr;
239 ptr += input.ld_col;
240 }
241 }
242 }
243
244 auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;
245
246 // Some padding variants include (or exclude) the padding values; we handle
247 // this by computing the extent of the padded input tensor and hence
248 // computing the total number of cells captured in the pooling window.
249 const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
250 const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
251 const auto captured_cells = captured_rows * valid_cols;
252 const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
253
254 for (; n_tile_cols; n_tile_cols--)
255 {
256 // Execute the kernel
257 Invoker<TInput, TOutput, OutputStage>::invoke(
258 reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
259 window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
260 );
261
262 // Update the pointers; the output strides by a column and the inputs
263 // stride by a number of columns.
264 outptr += output.ld_col;
265 for (auto n = 0u; n < n_valid_cells; n++)
266 {
267 inptrs[n] += this->m_args.pool_stride.cols * input.ld_col;
268 }
269 }
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +0000270 }
271
272 public:
ramelg01c827e992022-04-08 03:52:28 +0100273 PoolingDepthfirstGeneric(
274 const IGenericDepthfirstStrategy<TInput, TOutput, OutputStage> *strat,
275 const PoolingArgs &args,
276 const OutputStage &os = {}
277 )
278 : DepthfirstDriver<TInput, TOutput>(
279 new GenericDepthfirstWrapper<TInput, TOutput, OutputStage>(strat, args),
280 args
281 ),
282 m_os(os)
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +0000283 {
284 }
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +0000285};
286
287} // namespace pooling
288} // namespace arm_conv