/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>

#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"

Georgios Pinitas47d39dc2019-03-11 14:03:23 +000041#define MEMBERFN(TOUT) template <\
42 unsigned int OutputTileRows, unsigned int OutputTileColumns,\
43 unsigned int KernelRows, unsigned int KernelColumns,\
44 unsigned int StrideRows, unsigned int StrideColumns,\
45 typename TIn, typename TBias, typename TOut,\
46 typename Derived\
47> TOUT DepthwiseConvolutionBase<\
48 OutputTileRows, OutputTileColumns,\
49 KernelRows, KernelColumns,\
50 StrideRows, StrideColumns,\
51 TIn, TBias, TOut, Derived\
52>
53
54using namespace neon_convolution_kernels;
55
Georgios Pinitas4074c992018-01-30 18:13:46 +000056namespace depthwise
57{
58
/**
 * Interface for packing the weights and biases of a depthwise kernel into a
 * single buffer.
 *
 * Only the declaration of `execute` is visible here; its definition(s) are
 * expected to be provided elsewhere.
 *
 * @tparam KernelRows    Number of rows in the kernel.
 * @tparam KernelColumns Number of columns in the kernel.
 * @tparam WeightSize    Size of one weight; the default `_pack_params`
 *                       implementation in this file passes a `sizeof` value,
 *                       i.e. a size in bytes.
 * @tparam BiasSize      Size of one bias value, likewise in bytes.
 */
template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
struct PackParameters
{
  /**
   * @param n_channels        Number of channels to pack.
   * @param buffer            Destination buffer for the packed parameters.
   * @param weights           Source weights.
   * @param weight_row_stride Stride between kernel rows in `weights`.
   * @param weight_col_stride Stride between kernel columns in `weights`.
   * @param biases            Source biases.
   */
  static void execute(
    unsigned int n_channels,
    void *buffer,
    const void *weights,
    unsigned int weight_row_stride,
    unsigned int weight_col_stride,
    const void *biases
  );
};

// Number of channels that make up one unit of parallel work: get_window()
// divides the channel count by this value (via iceildiv) and run() converts a
// [start, stop) window of units back into a channel range by multiplying by it.
constexpr unsigned int CHANNEL_BLOCK = 16;

Georgios Pinitas47d39dc2019-03-11 14:03:23 +000074MEMBERFN(int)::get_output_size(
Georgios Pinitas1a57ad12019-01-09 16:11:51 +000075 const int dim_size, const unsigned int padding_before, const unsigned int padding_after
76)
77{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000078 return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
Georgios Pinitas1a57ad12019-01-09 16:11:51 +000079}
Georgios Pinitas4074c992018-01-30 18:13:46 +000080
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000081MEMBERFN(int)::output_size(
82 const int dim_size, const unsigned int padding_before, const unsigned int padding_after
83) const
Georgios Pinitas1a57ad12019-01-09 16:11:51 +000084{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000085 return get_output_size(dim_size, padding_before, padding_after);
Georgios Pinitas1a57ad12019-01-09 16:11:51 +000086}
87
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000088MEMBERFN()::DepthwiseConvolutionBase(
89 const int n_batches,
90 const int n_input_rows,
91 const int n_input_cols,
Georgios Pinitas1a57ad12019-01-09 16:11:51 +000092 const int n_channels,
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000093 ActivationFunction activation,
Georgios Pinitas1a57ad12019-01-09 16:11:51 +000094 const unsigned int padding_top,
95 const unsigned int padding_left,
96 const unsigned int padding_bottom,
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000097 const unsigned int padding_right
98) : _input(nullptr), _output(nullptr),
99 _packed_parameters(nullptr),
100 _working_space(nullptr),
Georgios Pinitas4074c992018-01-30 18:13:46 +0000101 _n_batches(n_batches),
102 _n_input_rows(n_input_rows),
103 _n_input_cols(n_input_cols),
104 _n_channels(n_channels),
Georgios Pinitas1a57ad12019-01-09 16:11:51 +0000105 _n_output_rows(get_output_size(n_input_rows, padding_top, padding_bottom)),
106 _n_output_cols(get_output_size(n_input_cols, padding_left, padding_right)),
Georgios Pinitas4074c992018-01-30 18:13:46 +0000107 _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
108 _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
Georgios Pinitas1a57ad12019-01-09 16:11:51 +0000109 _padding_top(padding_top),
110 _padding_left(padding_left),
111 _padding_bottom(padding_bottom),
112 _padding_right(padding_right),
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000113 _activation(activation),
114 _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
115 _input_ws_col_stride(_n_channels),
116 _input_ws_row_stride(_input_ws_col_stride * inner_tile_cols),
117 _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0),
118 _output_ws_col_stride(_n_channels),
119 _output_ws_row_stride(_output_ws_col_stride * OutputTileColumns)
Georgios Pinitas4074c992018-01-30 18:13:46 +0000120{
121}
122
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000123MEMBERFN(void)::set_input(const void* const inptr)
124{
125 set_input(inptr, _n_channels);
126}
Georgios Pinitas4074c992018-01-30 18:13:46 +0000127
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000128MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
129{
130 set_input(inptr, _n_input_cols * ld_col, ld_col);
131}
132
133MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
134{
135 set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col);
136}
137
138MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col)
139{
140 _input = static_cast<const TIn *>(inptr);
141 _input_batch_stride = ld_batch;
142 _input_row_stride = ld_row;
143 _input_col_stride = ld_col;
144}
145
146MEMBERFN(void)::set_output(void* const outptr)
147{
148 set_output(outptr, _n_channels);
149}
150
151MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
152{
153 set_output(outptr, _n_output_cols * ld_col, ld_col);
154}
155
156MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
157{
158 set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col);
159}
160
161MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col)
162{
163 _output = static_cast<TOut *>(outptr);
164 _output_batch_stride = ld_batch;
165 _output_row_stride = ld_row;
166 _output_col_stride = ld_col;
167}
168
169MEMBERFN(size_t)::get_packed_params_size(void) const
170{
171 return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
172}
173
174MEMBERFN(void)::set_packed_params_buffer(void *buffer)
175{
176 _packed_parameters = buffer;
177}
178
179MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
180{
181 static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases);
182}
183
184MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
185{
186 const unsigned int weight_col_stride = _n_channels;
187 const unsigned int weight_row_stride = KernelColumns * weight_col_stride;
188 static_cast<const Derived *>(this)->pack_params(
189 buffer, weights, weight_row_stride, weight_col_stride, biases
190 );
191}
192
193MEMBERFN(void)::pack_params(
194 void * const buffer,
195 const void * const weights,
196 const unsigned int weight_row_stride,
197 const unsigned int weight_col_stride,
198 const void * const biases
199) const
200{
201 static_cast<const Derived *>(this)->_pack_params(
202 buffer, weights, weight_row_stride, weight_col_stride, biases
203 );
204}
205
206MEMBERFN(void)::_pack_params(
207 void * const buffer,
208 const void * const weights,
209 const unsigned int weight_row_stride,
210 const unsigned int weight_col_stride,
211 const void * const biases
212) const
213{
214 // Default implementation
215 PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TOut)>::execute(
216 _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
217 );
218}
219
220MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
221{
222 return nthreads * (
223 _get_input_working_space_size() + _get_output_working_space_size()
224 );
225}
226
227MEMBERFN(void)::set_working_space(void *buffer)
228{
229 _working_space = buffer;
230}
231
232MEMBERFN(size_t)::_get_input_working_space_size(void) const
233{
234 return sizeof(TIn) * inner_tile_rows * inner_tile_cols * _n_channels;
235}
236
237MEMBERFN(size_t)::_get_output_working_space_size(void) const
238{
239 return sizeof(TOut) * OutputTileRows * OutputTileColumns * _n_channels;
240}
241
242MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
243{
244 return static_cast<uint8_t*>(_working_space) + threadid * (
245 _get_input_working_space_size() + _get_output_working_space_size()
246 );
247}
248
249MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
250{
251 return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size();
252}
253
254MEMBERFN(unsigned int)::get_window() const
Georgios Pinitas4074c992018-01-30 18:13:46 +0000255{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000256 // Parallelise over blocks of channels.
257 return iceildiv(_n_channels, CHANNEL_BLOCK);
Georgios Pinitas4074c992018-01-30 18:13:46 +0000258}
259
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000260MEMBERFN(void)::run(
Georgios Pinitas4074c992018-01-30 18:13:46 +0000261 const unsigned int start,
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000262 const unsigned int stop,
263 const unsigned int threadid
Georgios Pinitas4074c992018-01-30 18:13:46 +0000264)
265{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000266 // Parallelise over blocks of channels
267 const auto start_channel = CHANNEL_BLOCK * start;
268 const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
Georgios Pinitas4074c992018-01-30 18:13:46 +0000269
270 // Compute top and bottom padding for input and output
Georgios Pinitas1a57ad12019-01-09 16:11:51 +0000271 const int input_pad_top = _padding_top;
272 const int input_pad_left = _padding_left;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000273 constexpr int tile_overlap = kernel_rows - stride_rows;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000274
275 // Perform the convolution by calling `process_tile_row` for each tile row in
276 // each batch.
277 for (int batch = 0; batch < _n_batches; batch++)
278 {
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000279 const TIn* const inptr_batch = _input + batch*_input_batch_stride;
280 TOut* const outptr_batch = _output + batch*_output_batch_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000281
282 // Loop over rows of tiles
283 for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
284 {
285 // Pointer to the row
286 const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000287 const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride);
288 TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000289
290 // Input padding (top + bottom) for the row
291 const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
292 const int input_row_bottom = input_row_top + inner_tile_rows;
293 const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
294 const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);
295
296 // Output padding (bottom) for the row
297 const int output_row_bottom = (tile_i + 1)*output_tile_rows;
298 const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
299
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000300 // Get the offset into the packed parameters
301 const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
302 start_channel*(sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
303
Georgios Pinitas4074c992018-01-30 18:13:46 +0000304 // Process the row
305 process_tile_row(
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000306 threadid,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000307 stop_channel - start_channel,
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000308 params_ptr,
309 inptr_row + start_channel,
310 outptr_row + start_channel,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000311 input_row_pad_top, input_pad_left, input_row_pad_bottom,
312 output_row_pad_bottom,
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000313 _n_tile_cols, _n_input_cols, _n_output_cols
Georgios Pinitas4074c992018-01-30 18:13:46 +0000314 );
315 }
316 }
317}
318
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000319MEMBERFN(void)::process_tile_row(
320 const unsigned int threadid,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000321 const int n_channels,
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000322 const void* const packed_params,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000323 const TIn* const inptr,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000324 TOut* const outptr,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000325 const int row_pad_in_top,
326 const int row_pad_in_left,
327 const int row_pad_in_bottom,
328 const int row_pad_out_bottom,
329 const int n_tiles,
330 const int n_input_cols,
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000331 const int n_output_cols
Georgios Pinitas4074c992018-01-30 18:13:46 +0000332)
333{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000334 constexpr int tile_overlap = kernel_cols - stride_cols;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000335
336 // Loop over columns of tiles
337 for (int tile_j = 0; tile_j < n_tiles; tile_j++)
338 {
339 // Input padding (left + right) for the tile
340 const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0;
341 const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
342 const int t_in_end = t_in_start + inner_tile_cols;
343 const int t_pad_in_right = std::max(0, t_in_end - n_input_cols);
344
345 // Output padding (right) for the tile
346 const int t_out_end = (tile_j + 1) * output_tile_cols;
347 const int t_pad_out_right = std::max(0, t_out_end - n_output_cols);
348
349 // Get pointers into the inputs and outputs
350 const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000351 const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
352 TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000353
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000354 // Process just this tile
355 process_tile(
356 threadid, n_channels, packed_params, inptr_col, outptr_col,
357 row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, // Input paddings
358 row_pad_out_bottom, t_pad_out_right // Output paddings
Georgios Pinitas4074c992018-01-30 18:13:46 +0000359 );
360 }
361}
362
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000363MEMBERFN(TIn)::_input_padding_value(void) const
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000364{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000365 return static_cast<TIn>(0);
366}
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000367
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000368MEMBERFN(void)::process_tile(
369 const unsigned int threadid,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000370 const int n_channels,
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000371 const void* const packed_params,
372 const TIn* const inptr,
373 TOut* const outptr,
374 const int pad_in_top,
375 const int pad_in_left,
376 const int pad_in_bottom,
377 const int pad_in_right,
378 const int pad_out_bottom,
379 const int pad_out_right
Georgios Pinitas4074c992018-01-30 18:13:46 +0000380)
381{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000382 const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
383 const bool pad_output = pad_out_bottom || pad_out_right;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000384
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000385 if (pad_input)
Georgios Pinitas4074c992018-01-30 18:13:46 +0000386 {
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000387 // Copy the input into the temporary buffer, applying padding
388 padding::copy_and_pad_tile<TIn>(
389 inner_tile_rows, inner_tile_cols, n_channels,
390 inptr, _input_row_stride, _input_col_stride,
391 static_cast<TIn *>(_get_input_working_space(threadid)), _input_ws_row_stride, _input_ws_col_stride,
392 pad_in_top, pad_in_left, pad_in_bottom, pad_in_right,
393 static_cast<Derived *>(this)->_input_padding_value()
394 );
Georgios Pinitas4074c992018-01-30 18:13:46 +0000395 }
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000396
397 // Execute the kernel
398 const TIn * const tile_inptr = !pad_input ? inptr : static_cast<const TIn *>(_get_input_working_space(threadid));
399 const int in_row_stride = !pad_input ? _input_row_stride : _input_ws_row_stride;
400 const int in_col_stride = !pad_input ? _input_col_stride : _input_ws_col_stride;
401
402 TOut * const tile_outptr = !pad_output ? outptr : static_cast<TOut *>(_get_output_working_space(threadid));
403 const int out_row_stride = !pad_output ? _output_row_stride : _output_ws_row_stride;
404 const int out_col_stride = !pad_output ? _output_col_stride : _output_ws_col_stride;
405
406 Derived * dthis = static_cast<Derived *>(this);
407
408 switch(_activation)
409 {
410 case ActivationFunction::ReLU:
411 dthis->template execute_tile<ActivationFunction::ReLU>(
412 n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
413 );
414 break;
415 case ActivationFunction::ReLU6:
416 dthis->template execute_tile<ActivationFunction::ReLU6>(
417 n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
418 );
419 break;
420 default:
421 dthis->template execute_tile<ActivationFunction::None>(
422 n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
423 );
424 break;
425 }
426
427 if (pad_output)
428 {
429 // Copy the output from the temporary buffer, removing unnecessary values
430 padding::CopyCropped<OutputTileRows, OutputTileColumns>::execute(
431 n_channels * sizeof(TOut),
432 _get_output_working_space(threadid), _output_ws_row_stride * sizeof(TOut), _output_ws_col_stride * sizeof(TOut),
433 outptr, _output_row_stride * sizeof(TOut), _output_col_stride * sizeof(TOut),
434 0, 0, pad_out_bottom, pad_out_right
435 );
436 }
437}
438
439MEMBERFN(int)::n_channels(void) const
440{
441 return _n_channels;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000442}
443
Georgios Pinitas4074c992018-01-30 18:13:46 +0000444} // namespace depthwise