blob: 17889849dbcbc22a4eb8f31247cbce15ac5e62dc [file] [log] [blame]
/*
 * Copyright (c) 2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
32
33#include <algorithm>
34#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
35#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
36
37#pragma once
38
39namespace depthwise
40{
41
// Number of channels in one unit of work: get_window() counts blocks of this
// size and run() converts block indices back into channel indices with it.
constexpr unsigned int CHANNEL_BLOCK = 16;
43
Georgios Pinitas4074c992018-01-30 18:13:46 +000044template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
45int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
46 const int dim_size, const bool same_padding
47)
48{
49 return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR);
50}
51
52
53template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
54DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
55 const int n_batches, const int n_input_rows, const int n_input_cols,
56 const int n_channels, const bool padding_same,
57 const TIn* const weights,
58 const TIn* const input,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000059 TOut* const output,
60 const int weight_col_stride,
61 const int weight_row_stride,
62 const int input_col_stride,
63 const int input_row_stride,
64 const int input_batch_stride,
65 const int output_col_stride,
66 const int output_row_stride,
67 const int output_batch_stride
Georgios Pinitas4074c992018-01-30 18:13:46 +000068) : _weights(weights), _input(input), _output(output),
69 _n_batches(n_batches),
70 _n_input_rows(n_input_rows),
71 _n_input_cols(n_input_cols),
72 _n_channels(n_channels),
73 _n_output_rows(get_output_size(n_input_rows, padding_same)),
74 _n_output_cols(get_output_size(n_input_cols, padding_same)),
75 _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
76 _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000077 _padding_same(padding_same),
78 _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels),
79 _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride),
80 _input_col_stride(input_col_stride ? input_col_stride : _n_channels),
81 _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride),
82 _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride),
83 _output_col_stride(output_col_stride ? output_col_stride : _n_channels),
84 _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride),
85 _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride)
Georgios Pinitas4074c992018-01-30 18:13:46 +000086{
87}
88
89
90template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
91unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const
92{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000093 // Parallelise over blocks of channels.
94 return iceildiv(_n_channels, CHANNEL_BLOCK);
Georgios Pinitas4074c992018-01-30 18:13:46 +000095}
96
97
98template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
99void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
100 const unsigned int start,
101 const unsigned int stop
102)
103{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000104 // Parallelise over blocks of channels
105 const auto start_channel = CHANNEL_BLOCK * start;
106 const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
Georgios Pinitas4074c992018-01-30 18:13:46 +0000107
108 // Compute top and bottom padding for input and output
109 const int input_pad_top = _padding_same ?
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000110 ((_n_output_rows - 1)*stride_rows + kernel_rows - _n_input_rows) / 2 : 0;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000111 const int input_pad_left = _padding_same ?
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000112 ((_n_output_cols - 1)*stride_cols + kernel_cols - _n_input_cols) / 2 : 0;
113 constexpr int tile_overlap = kernel_rows - stride_rows;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000114
115 // Perform the convolution by calling `process_tile_row` for each tile row in
116 // each batch.
117 for (int batch = 0; batch < _n_batches; batch++)
118 {
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000119 const TIn* const inptr_batch = _input + batch*_input_batch_stride;
120 TOut* const outptr_batch = _output + batch*_output_batch_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000121
122 // Loop over rows of tiles
123 for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
124 {
125 // Pointer to the row
126 const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000127 const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride);
128 TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000129
130 // Input padding (top + bottom) for the row
131 const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
132 const int input_row_bottom = input_row_top + inner_tile_rows;
133 const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
134 const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);
135
136 // Output padding (bottom) for the row
137 const int output_row_bottom = (tile_i + 1)*output_tile_rows;
138 const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
139
140 // Process the row
141 process_tile_row(
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000142 stop_channel - start_channel,
143 _weights + start_channel, _weight_row_stride, _weight_col_stride,
144 inptr_row + start_channel, _input_row_stride, _input_col_stride,
145 outptr_row + start_channel, _output_row_stride, _output_col_stride,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000146 input_row_pad_top, input_pad_left, input_row_pad_bottom,
147 output_row_pad_bottom,
148 _n_tile_cols, _n_input_cols, _n_output_cols
149 );
150 }
151 }
152}
153
154
155template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
156void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row(
157 const int n_channels,
158 const TIn* const weights,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000159 const int weight_row_stride,
160 const int weight_col_stride,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000161 const TIn* const inptr,
162 const int in_row_stride,
163 const int in_col_stride,
164 TOut* const outptr,
165 const int out_row_stride,
166 const int out_col_stride,
167 const int row_pad_in_top,
168 const int row_pad_in_left,
169 const int row_pad_in_bottom,
170 const int row_pad_out_bottom,
171 const int n_tiles,
172 const int n_input_cols,
173 const int n_output_cols
174)
175{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000176 constexpr int tile_overlap = kernel_cols - stride_cols;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000177
178 // Loop over columns of tiles
179 for (int tile_j = 0; tile_j < n_tiles; tile_j++)
180 {
181 // Input padding (left + right) for the tile
182 const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0;
183 const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
184 const int t_in_end = t_in_start + inner_tile_cols;
185 const int t_pad_in_right = std::max(0, t_in_end - n_input_cols);
186
187 // Output padding (right) for the tile
188 const int t_out_end = (tile_j + 1) * output_tile_cols;
189 const int t_pad_out_right = std::max(0, t_out_end - n_output_cols);
190
191 // Get pointers into the inputs and outputs
192 const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
193 const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride);
194 TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride;
195
196 // Apply the specific tile processing function
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000197 const bool pad_top = row_pad_in_top > 0;
198 const bool pad_left = t_pad_in_left > 0;
199 const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom;
200 const bool pad_right = t_pad_in_right || t_pad_out_right;
201
202 const TileFn tilefn = [&] () {
203 if (!pad_top && !pad_left && !pad_bottom && !pad_right)
204 {
205 // No padding
206 return tilefn_unpadded;
207 }
208 else if (pad_top && !pad_left && !pad_bottom && !pad_right)
209 {
210 // Padding on the top only, subtract off the minimum expected padding in
211 // order to index into the array of specialised methods.
212 const int index = row_pad_in_top - min_in_pad_top;
213 return tilefn_top[index];
214 }
215 else if (!pad_top && pad_left && !pad_bottom && !pad_right)
216 {
217 // Padding on the left only, subtract off the minimum expected padding in
218 // order to index into the array of specialised methods.
219 const int index = t_pad_in_left - min_in_pad_left;
220 return tilefn_left[index];
221 }
222 else if (!pad_top && !pad_left && pad_bottom && !pad_right)
223 {
224 // Padding on the bottom only
225 return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom];
226 }
227 else if (!pad_top && !pad_left && !pad_bottom && pad_right)
228 {
229 // Padding on the right only
230 return tilefn_right[t_pad_in_right][t_pad_out_right];
231 }
232 else
233 {
234 // Otherwise use generic tile processing method.
235 return tilefn_generic;
236 }
237 }();
238
239 tilefn(
240 n_channels,
241 weights, weight_row_stride, weight_col_stride,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000242 inptr_col, in_row_stride, in_col_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000243 outptr_col, out_row_stride, out_col_stride,
244 row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,
245 row_pad_out_bottom, t_pad_out_right
Georgios Pinitas4074c992018-01-30 18:13:46 +0000246 );
247 }
248}
249
250
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000251// New templated struct used solely as a way to provide tile processing
252// specialisations.
253template <int OutputTileRows, int OutputTileCols,
254 int KernelRows, int KernelCols,
255 int StrideRows, int StrideCols,
256 typename TIn, typename TOut>
257struct DepthwiseConvolutionImpl : public DepthwiseConvolution<
258 OutputTileRows, OutputTileCols,
259 KernelRows, KernelCols,
260 StrideRows, StrideCols, TIn, TOut
261>
262{
263 typedef DepthwiseConvolution<
264 OutputTileRows, OutputTileCols,
265 KernelRows, KernelCols,
266 StrideRows, StrideCols,
267 TIn, TOut
268 > DWC;
269
270 /** Perform the depthwise convolution of a tile.
271 *
272 * @param[in] n_channels Number of channels.
273 * @param[in] weights Pointer to Height x Width x Channels ordered weights.
274 * @param[in] inptr Pointer to the top-left unpadded value of the tile.
275 * @param[in] in_row_stride Stride between rows of the input tensor.
276 * @param[in] in_col_stride Stride between columns of the input tensor.
277 * @param[out] outptr Pointer to the top-left output value for the tile.
278 * @param[in] out_row_stride Stride between rows of the output tensor.
279 * @param[in] out_col_stride Stride between columns of the output tensor.
280 *
281 * The following parameters may be ignored if the function has been
282 * specialised for specific padding constraints.
283 *
284 * @param[in] _in_pad_top Padding to apply to top of input tile.
285 * @param[in] _in_pad_left Padding to apply to left of input tile.
286 * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
287 * @param[in] _in_pad_right Padding to apply to right of input tile.
288 * @param[in] _out_pad_bottom Null cells at bottom of output tile.
289 * @param[in] _out_pad_right Null cells at right of output tile.
290 */
291 template <
292 bool Specialize=false, // Specialize (or not) the method
293 int InPadTop=0, // If specialized, top padding
294 int InPadLeft=0, // If specialized, left padding
295 int InPadBottom=0, // If specialized, bottom padding
296 int InPadRight=0, // If specialized, right padding
297 int OutPadBottom=0, // If specialized, bottom output padding
298 int OutPadRight=0 // If specialized, bottom right padding
299 >
300 static void process_tile(
301 const int n_channels,
302 const TIn* const weights,
303 const int weight_row_stride,
304 const int weight_col_stride,
305 const TIn* const inptr,
306 const int in_row_stride,
307 const int in_col_stride,
308 TOut* const outptr,
309 const int out_row_stride,
310 const int out_col_stride,
311 const int in_pad_top=0,
312 const int in_pad_left=0,
313 const int in_pad_bottom=0,
314 const int in_pad_right=0,
315 const int out_pad_bottom=0,
316 const int out_pad_right=0
317 );
318};
319
320
Georgios Pinitas4074c992018-01-30 18:13:46 +0000321template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
322template <
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000323 bool Specialize,
324 int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
325 int OutPadBottom, int OutPadRight
Georgios Pinitas4074c992018-01-30 18:13:46 +0000326>
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000327void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile(
Georgios Pinitas4074c992018-01-30 18:13:46 +0000328 const int n_channels,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000329 const TIn *__restrict__ const weights,
330 const int weight_row_stride,
331 const int weight_col_stride,
332 const TIn *__restrict__ const inptr,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000333 const int in_row_stride,
334 const int in_col_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000335 TOut *__restrict__ const outptr,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000336 const int out_row_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000337 const int out_col_stride,
338 const int _in_pad_top,
339 const int _in_pad_left,
340 const int _in_pad_bottom,
341 const int _in_pad_right,
342 const int _out_pad_bottom,
343 const int _out_pad_right
Georgios Pinitas4074c992018-01-30 18:13:46 +0000344)
345{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000346 constexpr auto inner_tile_rows = DWC::inner_tile_rows;
347 constexpr auto inner_tile_cols = DWC::inner_tile_cols;
348 constexpr auto kernel_rows = DWC::kernel_rows;
349 constexpr auto kernel_cols = DWC::kernel_cols;
350 constexpr auto output_tile_rows = DWC::output_tile_rows;
351 constexpr auto output_tile_cols = DWC::output_tile_cols;
352 constexpr auto stride_rows = DWC::stride_rows;
353 constexpr auto stride_cols = DWC::stride_cols;
354
355 // Extract parameters
356 const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
357 const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
358 const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
359 const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
360 const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
361 const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
362
Georgios Pinitas4074c992018-01-30 18:13:46 +0000363 // Compute valid ranges of the tile
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000364 const int in_cells_i = inner_tile_rows - in_pad_bottom;
365 const int in_cells_j = inner_tile_cols - in_pad_right;
366 const int out_cells_i = output_tile_rows - out_pad_bottom;
367 const int out_cells_j = output_tile_cols - out_pad_right;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000368
369 // Instantiate pointers
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000370 const TIn* __restrict__ inptr_base = inptr;
371 const TIn* __restrict__ wptr_base = weights;
372 TOut* __restrict__ outptr_base = outptr;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000373
374 // Perform the depthwise convolution
375 int channels_remaining = n_channels;
376 for (; channels_remaining; channels_remaining--)
377 {
378 // Load input tile
379 TIn u[inner_tile_rows][inner_tile_cols];
380 for (int i = 0; i < inner_tile_rows; i++)
381 {
382 const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
383 for (int j = 0; j < inner_tile_cols; j++)
384 {
385 if (i < in_pad_top || in_cells_i <= i ||
386 j < in_pad_left || in_cells_j <= j)
387 {
388 u[i][j] = static_cast<TIn>(0);
389 }
390 else
391 {
392 u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
393 }
394 }
395 }
396 inptr_base++;
397
398 // Load weights tile
399 TIn w[kernel_rows][kernel_cols];
400 for (int i = 0; i < kernel_rows; i++)
401 {
402 const TIn* const wptr_row = wptr_base + i*weight_row_stride;
403 for (int j = 0; j < kernel_cols; j++)
404 {
405 w[i][j] = *(wptr_row + j*weight_col_stride);
406 }
407 }
408 wptr_base++;
409
410 // Perform the convolution
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000411 TOut v[output_tile_rows][output_tile_cols];
Georgios Pinitas4074c992018-01-30 18:13:46 +0000412 for (int out_i = 0; out_i < out_cells_i; out_i++)
413 {
414 for (int out_j = 0; out_j < out_cells_j; out_j++)
415 {
416 // Clear the accumulator
417 v[out_i][out_j] = static_cast<TOut>(0);
418
419 // Base co-ordinate
420 const int base_i = out_i * stride_rows;
421 const int base_j = out_j * stride_cols;
422
423 // Fill the accumulator
424 for (int in_i = 0; in_i < kernel_rows; in_i++)
425 {
426 const int i = base_i + in_i;
427 for (int in_j = 0; in_j < kernel_cols; in_j++)
428 {
429 const int j = base_j + in_j;
430 v[out_i][out_j] += w[in_i][in_j] * u[i][j];
431 }
432 }
433 }
434 }
435
436 // Store the output tile
437 for (int i = 0; i < out_cells_i; i++)
438 {
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000439 TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000440 for (int j = 0; j < out_cells_j; j++)
441 {
442 *(outptr_row + j*out_col_stride) = v[i][j];
443 }
444 }
445 outptr_base++;
446 }
447}
448
Georgios Pinitas4074c992018-01-30 18:13:46 +0000449} // namespace depthwise