/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include "pool_common.hpp"

#include <stack>
#include <vector>

namespace arm_conv {
namespace pooling {

template <class strategy>
class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
{
  using TInput = typename strategy::operand_type;
  using TOutput = typename strategy::return_type;

  const PoolingArgs m_args;  // Copy of arguments

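  // Input patch consumed by one native output tile of the strategy: each
  // dimension needs (out - 1)*stride + pool_size input elements. For
  // example, a 2x2 output tile of 3x3 pooling at stride 2 reads
  // (2 - 1)*2 + 3 = 5 rows and 5 columns of input.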
  constexpr static unsigned int input_rows(void)
  {
    return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
  }

  constexpr static unsigned int input_cols(void)
  {
    return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
  }

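  // Sizes of the two channel-length scratch vectors carved out of the
  // working space: one of padding values to read in place of out-of-bounds
  // input, and one into which surplus (out-of-bounds) output is dumped.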
  size_t sizeof_input_buffer(void) const
  {
    return sizeof(TInput) * m_args.n_channels;
  }

  size_t sizeof_output_buffer(void) const
  {
    return sizeof(TOutput) * m_args.n_channels;
  }

  public:
  PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
  {
  }

  PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
  PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;

  size_t get_working_size(void) const override
  {
    // We require a channel-length vector of padding values to serve as
    // out-of-bounds input and a channel-length vector into which surplus
    // output can be dumped; the arrays of input and output pointers live
    // on the stack during execution rather than in the working space.
    return sizeof_input_buffer() + sizeof_output_buffer();
  }

  void execute(
    const void *const input,
    void *const output,
    void *const working_space
  ) const override
  {
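    // Derive dense NHWC strides from the problem size: channels are
    // contiguous, then columns, rows and batches.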
    const size_t ld_input_col = m_args.n_channels;
    const size_t ld_input_row = ld_input_col * m_args.input_cols;
    const size_t ld_input_batch = ld_input_row * m_args.input_rows;
    const size_t ld_output_col = ld_input_col;
    const size_t ld_output_row = ld_output_col * m_args.output_cols;
    const size_t ld_output_batch = ld_output_row * m_args.output_rows;

    execute(
      input, ld_input_col, ld_input_row, ld_input_batch,
      output, ld_output_col, ld_output_row, ld_output_batch,
      working_space
    );
  }

  void execute(
    const void *const input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    void *const output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *const working_space
  ) const override
  {
    execute(
      m_args.n_batches, m_args.input_rows, m_args.input_cols,
      m_args.n_channels,
      input, ld_input_col, ld_input_row, ld_input_batch,
      m_args.padding,
      m_args.output_rows, m_args.output_cols,
      output, ld_output_col, ld_output_row, ld_output_batch,
      working_space
    );
  }

  void execute(
    unsigned int batches,
    unsigned int input_height,
    unsigned int input_width,
    unsigned int channels,
    const void *const _input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const PaddingValues &padding,
    unsigned int output_height,
    unsigned int output_width,
    void *const _output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *const _working_space
  ) const override
  {
    strategy strat(m_args.cpu_info);
#ifdef CYCLE_PROFILING
    arm_gemm::profiler prof;
#endif // CYCLE_PROFILING

    // Cast input and output pointers into the right types
    const TInput *const inptr = static_cast<const TInput *>(_input);
    TOutput *const outptr = static_cast<TOutput *>(_output);

    // Allocate portions of the working space
    uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
    TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());

    // Fill the input buffer
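    // The padding value is the identity of the pooling operation: zero for
    // average pooling, and the most negative representable value (negative
    // infinity where the type has one) for max pooling, so that padded
    // elements can never be selected as the maximum.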
    const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
                           ? static_cast<TInput>(0)
                           : (std::numeric_limits<TInput>::has_infinity
                              ? -std::numeric_limits<TInput>::infinity()
                              : std::numeric_limits<TInput>::lowest());
    for (unsigned int i = 0; i < channels; i++)
    {
      input_buffer[i] = pad_value;
    }

    // Keep subdividing the output plane across the longest dimension until
    // we reach the size of the tile. Queue items for later processing.
    // Note: we could determine the largest size of the queue a priori from
    // the input tensor size; this would allow us to allocate memory within
    // the working space and improve performance.
    struct WorkItem
    {
      unsigned int output_i, output_j;
      unsigned int output_height, output_width;

      WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
        : output_i(i), output_j(j), output_height(height), output_width(width) {}
    };

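    // Process a single tile no larger than the strategy's native output
    // size: build the arrays of input and output pointers, then invoke the
    // kernel once.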
    auto execute = [&] (const WorkItem &item) {
      // Create an array for the output pointers
      TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
      TOutput **const outptr_array = _outptr_array;
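      // Pointers for elements outside the valid output region are directed
      // at the shared surplus buffer, so the kernel can always store a full
      // out_rows() x out_cols() tile.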

      // Construct the output pointer array
      {
        const auto output_pad_right = strategy::out_cols() - item.output_width;
        auto outptr_element = outptr_array;
        auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;

        // Fill the array with pointers to the output buffer
        for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
        {
          outptr_array[i] = output_buffer;
        }

        // Fill in the valid portion of the array
        for (unsigned int i = 0; i < item.output_height; i++)
        {
          auto outptr_col = outptr_row;
          for (unsigned int j = 0; j < item.output_width; j++)
          {
            *(outptr_element++) = outptr_col;
            outptr_col += ld_output_col;
          }
          outptr_element += output_pad_right;
          outptr_row += ld_output_row;
        }
      }

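      // Map the output tile back into input coordinates; a negative start
      // index or an end index beyond the tensor extent indicates rows or
      // columns of implied padding.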
      const int start_i = item.output_i * strategy::stride_rows() - padding.top;
      const int end_i = start_i + input_rows();
      const unsigned int pad_top = std::max(0, 0 - start_i);
      const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));

      const int start_j = item.output_j * strategy::stride_cols() - padding.left;
      const int end_j = start_j + input_cols();
      const unsigned int pad_left = std::max(0, 0 - start_j);
      const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));

      // Create an array for the input pointers
      const TInput * _inptr_array[input_rows() * input_cols()];
      const TInput **const inptr_array = _inptr_array;
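      // Pointers in the padded border are directed at the shared buffer of
      // padding values, so the kernel always reads a full
      // input_rows() x input_cols() patch.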
      {
        const unsigned int row_padding = pad_top + pad_bottom;
        const unsigned int valid_rows = input_rows() - row_padding;

        const unsigned int col_padding = pad_left + pad_right;
        const unsigned int valid_cols = input_cols() - col_padding;

        // Fill the array with pointers to the input buffer
        for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
        {
          inptr_array[i] = input_buffer;
        }

        // Compute valid initial pointer
        auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;

        // Fill in the valid portion of the input array
        auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
        for (unsigned int i = 0; i < valid_rows; i++)
        {
          auto inptr_col = inptr_row;
          for (unsigned int j = 0; j < valid_cols; j++)
          {
            *(inptr_element++) = inptr_col;
            inptr_col += ld_input_col;
          }

          inptr_row += ld_input_row;
          inptr_element += col_padding;  // Skip the padding elements
        }
      }

      // Call the kernel
#ifdef CYCLE_PROFILING
      // TODO Work number
      auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
#endif // CYCLE_PROFILING
      strat.kernel(channels, inptr_array, outptr_array,
                   pad_left, pad_top, pad_right, pad_bottom);
    };

    // Add the initial work item to the stack of work.
    std::stack<WorkItem, std::vector<WorkItem>> stack;
    stack.push(WorkItem(0, 0, output_height, output_width));
    while (!stack.empty())
    {
      // Pop an item from the stack, bisect the largest dimension and either
      // execute the resulting tiles or add them to the stack if they are too
      // large.
      const WorkItem item(stack.top());
      stack.pop();

      if (item.output_height <= strategy::out_rows() &&
          item.output_width <= strategy::out_cols())
      {
        execute(item);
      }
      else
      {
        // Split the largest dimension, such that we get an exact number of
        // tiles in the first partition.
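        // For example, with out_rows() == 3 and output_height == 14:
        // height_in_tiles == ceil(14/3) == 5, so the first partition takes
        // 3 tiles == 9 rows and the second takes the remaining 5 rows.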
        if (item.output_height >= item.output_width)
        {
          const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
          const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;

          const unsigned int height_first = tiles_first * strategy::out_rows();
          const unsigned int height_second = item.output_height - height_first;

          stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
          stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
        }
        else
        {
          const unsigned int width_in_tiles = (item.output_width + strategy::out_cols() - 1) / strategy::out_cols();
          const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;

          const unsigned int width_first = tiles_first * strategy::out_cols();
          const unsigned int width_second = item.output_width - width_first;

          stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
          stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
        }
      }
    }
  }
};

} // namespace pooling
} // namespace arm_conv