blob: 63333c8fb49c474ec458ad18cf346ad86d15fb05 [file] [log] [blame]
/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25#pragma once
26
ramelg01c827e992022-04-08 03:52:28 +010027#include "depthfirst_driver.hpp"
28#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000029#include "utils.hpp"
Pablo Marquez Tello84a09412022-06-30 17:00:09 +010030#if !defined(_WIN64) && !defined(__OpenBSD__)
ramelg01c827e992022-04-08 03:52:28 +010031#include <alloca.h>
Pablo Marquez Tello84a09412022-06-30 17:00:09 +010032#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000033#include <limits>
34
35namespace arm_conv {
36namespace pooling {
37
ramelg01c827e992022-04-08 03:52:28 +010038template <typename TInput, typename TOutput>
39class DepthfirstStrategy : public IDepthfirstStrategy
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000040{
ramelg01c827e992022-04-08 03:52:28 +010041 unsigned int input_rows, input_cols, output_rows, output_cols;
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000042
ramelg01c827e992022-04-08 03:52:28 +010043 public:
44 DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols,
45 unsigned int stride_rows, unsigned int stride_cols,
46 unsigned int output_rows, unsigned int output_cols)
47 : input_rows(output_rows + (window_rows - 1) * stride_rows),
48 input_cols(output_cols + (window_cols - 1) * stride_cols),
49 output_rows(output_rows), output_cols(output_cols)
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000050 {
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000051 }
52
ramelg01c827e992022-04-08 03:52:28 +010053 unsigned int get_input_rows() const override { return input_rows; }
54 unsigned int get_input_cols() const override { return input_cols; }
55 unsigned int get_output_rows() const override { return output_rows; }
56 unsigned int get_output_cols() const override { return output_cols; }
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000057
ramelg01c827e992022-04-08 03:52:28 +010058 typedef void (*KernelType)(
59 unsigned int n_channels,
60 const TInput *const *,
61 TOutput *const *,
62 bool exclude_padding,
63 unsigned int pad_left,
64 unsigned int pad_top,
65 unsigned int pad_right,
66 unsigned int pad_bottom
67 );
68 virtual KernelType get_kernel(void) const = 0;
69};
70
71
/* Header placed at the start of each thread's working space. It holds the
 * addresses of the two per-thread scratch buffers used by the pooling driver.
 */
struct WorkingSpace
{
  /* Buffer that padded input pointers are directed at. */
  void *input_buffer;

  /* Buffer that out-of-range output pointers are directed at. */
  void *output_buffer;
};
77
78
79template <typename TInput, typename TOutput=TInput, class OutputStage=Nothing>
80class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
81{
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000082 size_t sizeof_input_buffer(void) const
83 {
ramelg01c827e992022-04-08 03:52:28 +010084 return sizeof(TInput) * this->m_args.n_channels;
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +000085 }
86
87 size_t sizeof_output_buffer(void) const
88 {
ramelg01c827e992022-04-08 03:52:28 +010089 return sizeof(TOutput) * this->m_args.n_channels;
90 }
91
92 protected:
93 /* Compute the amount of working space required for a single thread. */
94 size_t get_working_size_per_thread(unsigned int n_channels) const override
95 {
96 return sizeof(WorkingSpace) + n_channels * (sizeof(TInput) + sizeof(TOutput));
97 }
98
99 /* Initialise the working space for a thread. */
100 void initialise_working_space(void *raw_ws, unsigned int n_channels) const override
101 {
102 auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
103 ws->input_buffer = ws + 1;
104 ws->output_buffer = reinterpret_cast<TInput *>(ws + 1) + n_channels;
105
106 // Fill the input buffer with an appropriate value
107 TInput fill_val = 0;
108 if (this->m_args.pool_type == PoolingType::MAX)
109 {
110 using limits = std::numeric_limits<TInput>;
111 if (limits::has_infinity)
112 {
113 fill_val = -limits::infinity();
114 }
115 else
116 {
117 fill_val = limits::min();
118 }
119 }
120
121 auto ptr = reinterpret_cast<TInput *>(ws->input_buffer);
122 for (; n_channels; n_channels--)
123 {
124 *(ptr++) = fill_val;
125 }
126 }
127
128 /* Compute a portion of the output tensor with padding. */
129 void compute_tile_padded(
130 unsigned int output_i, unsigned int output_j,
131 unsigned int channel_start, unsigned int channel_end,
132 const TensorSpec<const TInput *> &input,
133 const TensorSpec<TOutput *> &output,
134 void *working_space
135 ) const override
136 {
137 const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
138 this->m_strat.get())->get_kernel();
139
140 // Get the working space, and some space on the stack for pointer arrays
141 auto ws = reinterpret_cast<WorkingSpace *>(working_space);
142 auto inptr_array = reinterpret_cast<const TInput **>(alloca(
143 sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
144 auto outptr_array = reinterpret_cast<TOutput **>(alloca(
145 sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
146
147 // Prepare the input pointers
148 const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
149 const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
150 const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
151
152 const unsigned int end_ii = ii + this->m_strat->get_input_rows();
153 const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
154
155 const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
156 const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
157 const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
158
159 const unsigned int end_ij = ij + this->m_strat->get_input_cols();
160 const auto input_pad_right = end_ij < this->m_args.input_cols ? 0 : end_ij - this->m_args.input_cols;
161
162 fill_pointer_array<const TInput>(
163 inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
164 input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
165 input.ld_row, input.ld_col,
166 reinterpret_cast<const TInput *>(ws->input_buffer),
167 input_pad_top, this->m_args.input_rows - input_i,
168 input_pad_left, this->m_args.input_cols - input_j
169 );
170
171 // Prepare the output pointers
172 fill_pointer_array(
173 outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
174 output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
175 output.ld_row, output.ld_col,
176 reinterpret_cast<TOutput *>(ws->output_buffer),
177 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
178 0, this->m_args.output_cols - output_j // Left padding, # valid columns
179 );
180
181 // Call the kernel
182 kern(
183 channel_end - channel_start, inptr_array, outptr_array,
184 this->m_args.exclude_padding,
185 input_pad_left, input_pad_top,
186 input_pad_right, input_pad_bottom
187 );
188 }
189
190 // Compute a portion of the work with only top/bottom padding.
191 void compute_row_padded_tile_row(
192 const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
193 const unsigned int channel_start, const unsigned int channel_end,
194 const TensorSpec<const TInput *> &input,
195 const TensorSpec<TOutput *> &output,
196 void *working_space
197 ) const override
198 {
199 const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
200 this->m_strat.get())->get_kernel();
201
202 // Get the working space, and some space on the stack for pointer arrays
203 auto ws = reinterpret_cast<WorkingSpace *>(working_space);
204 auto inptr_array = reinterpret_cast<const TInput **>(alloca(
205 sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
206 auto outptr_array = reinterpret_cast<TOutput **>(alloca(
207 sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
208
209 // Prepare the initial input pointers
210 const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
211 const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
212 const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
213
214 const unsigned int end_ii = ii + this->m_strat->get_input_rows();
215 const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
216
217 const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
218 const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
219
220 const auto end_oi = output_i + this->m_strat->get_output_cols();
221 const auto output_pad_bottom = end_oi < this->m_args.output_rows ? 0 : end_oi - this->m_args.output_rows;
222
223 fill_pointer_array<const TInput>(
224 inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
225 input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
226 input.ld_row, input.ld_col,
227 reinterpret_cast<const TInput *>(ws->input_buffer),
228 input_pad_top, this->m_args.input_rows - input_i,
229 0, this->m_args.input_cols - input_j
230 );
231
232 // Prepare the initial output pointers
233 fill_pointer_array(
234 outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
235 output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
236 output.ld_row, output.ld_col,
237 reinterpret_cast<TOutput *>(ws->output_buffer),
238 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
239 0, this->m_args.output_cols - output_j // Left padding, # valid columns
240 );
241
242 // Call the kernel
243 for (; n_tile_cols; n_tile_cols--)
244 {
245 kern(
246 channel_end - channel_start, inptr_array, outptr_array,
247 this->m_args.exclude_padding,
248 0, input_pad_top,
249 0, input_pad_bottom
250 );
251
252 // Progress the input and output pointer arrays
253 const auto input_col_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.pool_stride.cols;
254 for (
255 auto n = input_pad_top * this->m_strat->get_input_cols();
256 n < (this->m_strat->get_input_rows() - input_pad_bottom) * this->m_strat->get_input_cols();
257 n++
258 )
259 {
260 inptr_array[n] += input_col_stride;
261 }
262
263 const auto output_col_stride = output.ld_col * this->m_strat->get_output_cols();
264 for (
265 auto n = 0u;
266 n < (this->m_strat->get_output_rows() - output_pad_bottom) * this->m_strat->get_output_cols();
267 n++
268 )
269 {
270 outptr_array[n] += output_col_stride;
271 }
272 }
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +0000273 }
274
275 public:
ramelg01c827e992022-04-08 03:52:28 +0100276 PoolingDepthfirst(const DepthfirstStrategy<TInput, TOutput> *strat,
277 const PoolingArgs &args, const OutputStage &os = {})
278 : DepthfirstDriver<TInput, TOutput>(strat, args)
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +0000279 {
ramelg01c827e992022-04-08 03:52:28 +0100280 ARM_COMPUTE_UNUSED(os);
Michele Di Giorgiod556d7b2020-10-27 10:56:31 +0000281 }
282};
283
284} // namespace pooling
285} // namespace arm_conv