blob: b33f2768adbf4c723f21e4eb7282ad3e2d8d48cc [file] [log] [blame]
Georgios Pinitas4074c992018-01-30 18:13:46 +00001/*
Georgios Pinitas1a57ad12019-01-09 16:11:51 +00002 * Copyright (c) 2018-2019 ARM Limited.
Georgios Pinitas4074c992018-01-30 18:13:46 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25/*
26 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
27 *
28 * NOTE: Header to be included by implementation files only.
29 *
30 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
31 */
32
33#include <algorithm>
34#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
35#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
36
37#pragma once
38
39namespace depthwise
40{
41
// Number of channels grouped into one unit of parallel work: get_window()
// counts blocks of this size and run() converts block indices back into
// channel indices with it.
constexpr unsigned int CHANNEL_BLOCK = 16;
43
namespace
{
  /** Total amount of padding (both edges combined) along one dimension.
   *
   * @param[in] padding_same If false, no padding is applied and 0 is returned.
   * @param[in] kernel_dim   Size of the kernel along the dimension.
   * @param[in] stride_dim   Stride along the dimension.
   * @param[in] input_dim    Size of the input along the dimension.
   *
   * @return `kernel_dim` minus the remainder of `input_dim / stride_dim` (or
   *         minus `stride_dim` itself when the input divides evenly), clamped
   *         so that it is never negative.
   */
  inline int pad_along_dim(
    const bool padding_same,
    const int kernel_dim,
    const int stride_dim,
    const int input_dim
  )
  {
    if (!padding_same)
    {
      return 0;
    }

    // An input that divides evenly by the stride is treated as if a whole
    // stride were left over.
    const int remainder = input_dim % stride_dim;
    const int leftover = (remainder != 0) ? remainder : stride_dim;
    return std::max(kernel_dim - leftover, 0);
  }
}  // namespace
61
Georgios Pinitas4074c992018-01-30 18:13:46 +000062template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
63int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
64 const int dim_size, const bool same_padding
65)
66{
67 return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR);
68}
69
Georgios Pinitas1a57ad12019-01-09 16:11:51 +000070template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
71int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
72 const int dim_size, const unsigned int padding_before, const unsigned int padding_after
73)
74{
75 return iceildiv(dim_size + padding_before + padding_after - KR + 1, SR);
76}
Georgios Pinitas4074c992018-01-30 18:13:46 +000077
78template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
79DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
80 const int n_batches, const int n_input_rows, const int n_input_cols,
81 const int n_channels, const bool padding_same,
82 const TIn* const weights,
83 const TIn* const input,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000084 TOut* const output,
85 const int weight_col_stride,
86 const int weight_row_stride,
87 const int input_col_stride,
88 const int input_row_stride,
89 const int input_batch_stride,
90 const int output_col_stride,
91 const int output_row_stride,
92 const int output_batch_stride
Georgios Pinitas1a57ad12019-01-09 16:11:51 +000093) : DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>(
94 n_batches, n_input_rows, n_input_cols,
95 n_channels,
96 pad_along_dim(padding_same, KR, SR, n_input_rows) / 2, /* top padding */
97 pad_along_dim(padding_same, KC, SC, n_input_cols) / 2, /* left padding */
98 iceildiv(pad_along_dim(padding_same, KR, SR, n_input_rows), 2), /* bottom padding */
99 iceildiv(pad_along_dim(padding_same, KC, SC, n_input_cols), 2), /* right padding */
100 weights, input, output,
101 weight_col_stride, weight_row_stride,
102 input_col_stride, input_row_stride, input_batch_stride,
103 output_col_stride, output_row_stride, output_batch_stride
104)
105{
106}
107
108
109template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
110DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
111 const int n_batches, const int n_input_rows, const int n_input_cols,
112 const int n_channels,
113 const unsigned int padding_top,
114 const unsigned int padding_left,
115 const unsigned int padding_bottom,
116 const unsigned int padding_right,
117 const TIn* const weights,
118 const TIn* const input,
119 TOut* const output,
120 const int weight_col_stride,
121 const int weight_row_stride,
122 const int input_col_stride,
123 const int input_row_stride,
124 const int input_batch_stride,
125 const int output_col_stride,
126 const int output_row_stride,
127 const int output_batch_stride
Georgios Pinitas4074c992018-01-30 18:13:46 +0000128) : _weights(weights), _input(input), _output(output),
129 _n_batches(n_batches),
130 _n_input_rows(n_input_rows),
131 _n_input_cols(n_input_cols),
132 _n_channels(n_channels),
Georgios Pinitas1a57ad12019-01-09 16:11:51 +0000133 _n_output_rows(get_output_size(n_input_rows, padding_top, padding_bottom)),
134 _n_output_cols(get_output_size(n_input_cols, padding_left, padding_right)),
Georgios Pinitas4074c992018-01-30 18:13:46 +0000135 _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
136 _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
Georgios Pinitas1a57ad12019-01-09 16:11:51 +0000137 _padding_top(padding_top),
138 _padding_left(padding_left),
139 _padding_bottom(padding_bottom),
140 _padding_right(padding_right),
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000141 _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels),
142 _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride),
143 _input_col_stride(input_col_stride ? input_col_stride : _n_channels),
144 _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride),
145 _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride),
146 _output_col_stride(output_col_stride ? output_col_stride : _n_channels),
147 _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride),
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100148 _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride),
149 _input_offset(0), _weights_offset(0)
Georgios Pinitas4074c992018-01-30 18:13:46 +0000150{
151}
152
153
154template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
155unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const
156{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000157 // Parallelise over blocks of channels.
158 return iceildiv(_n_channels, CHANNEL_BLOCK);
Georgios Pinitas4074c992018-01-30 18:13:46 +0000159}
160
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100161template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
162void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset)
163{
164 _input_offset = input_offset;
165 _weights_offset = weights_offset;
166}
Georgios Pinitas4074c992018-01-30 18:13:46 +0000167
168template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
169void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
170 const unsigned int start,
171 const unsigned int stop
172)
173{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000174 // Parallelise over blocks of channels
175 const auto start_channel = CHANNEL_BLOCK * start;
176 const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
Georgios Pinitas4074c992018-01-30 18:13:46 +0000177
178 // Compute top and bottom padding for input and output
Georgios Pinitas1a57ad12019-01-09 16:11:51 +0000179 const int input_pad_top = _padding_top;
180 const int input_pad_left = _padding_left;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000181 constexpr int tile_overlap = kernel_rows - stride_rows;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000182
183 // Perform the convolution by calling `process_tile_row` for each tile row in
184 // each batch.
185 for (int batch = 0; batch < _n_batches; batch++)
186 {
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000187 const TIn* const inptr_batch = _input + batch*_input_batch_stride;
188 TOut* const outptr_batch = _output + batch*_output_batch_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000189
190 // Loop over rows of tiles
191 for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
192 {
193 // Pointer to the row
194 const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000195 const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride);
196 TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000197
198 // Input padding (top + bottom) for the row
199 const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
200 const int input_row_bottom = input_row_top + inner_tile_rows;
201 const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
202 const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);
203
204 // Output padding (bottom) for the row
205 const int output_row_bottom = (tile_i + 1)*output_tile_rows;
206 const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
207
208 // Process the row
209 process_tile_row(
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000210 stop_channel - start_channel,
211 _weights + start_channel, _weight_row_stride, _weight_col_stride,
212 inptr_row + start_channel, _input_row_stride, _input_col_stride,
213 outptr_row + start_channel, _output_row_stride, _output_col_stride,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000214 input_row_pad_top, input_pad_left, input_row_pad_bottom,
215 output_row_pad_bottom,
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100216 _n_tile_cols, _n_input_cols, _n_output_cols,
217 _input_offset, _weights_offset
Georgios Pinitas4074c992018-01-30 18:13:46 +0000218 );
219 }
220 }
221}
222
223
224template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
225void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row(
226 const int n_channels,
227 const TIn* const weights,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000228 const int weight_row_stride,
229 const int weight_col_stride,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000230 const TIn* const inptr,
231 const int in_row_stride,
232 const int in_col_stride,
233 TOut* const outptr,
234 const int out_row_stride,
235 const int out_col_stride,
236 const int row_pad_in_top,
237 const int row_pad_in_left,
238 const int row_pad_in_bottom,
239 const int row_pad_out_bottom,
240 const int n_tiles,
241 const int n_input_cols,
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100242 const int n_output_cols,
243 const int input_offset,
244 const int weights_offset
Georgios Pinitas4074c992018-01-30 18:13:46 +0000245)
246{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000247 constexpr int tile_overlap = kernel_cols - stride_cols;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000248
249 // Loop over columns of tiles
250 for (int tile_j = 0; tile_j < n_tiles; tile_j++)
251 {
252 // Input padding (left + right) for the tile
253 const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0;
254 const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
255 const int t_in_end = t_in_start + inner_tile_cols;
256 const int t_pad_in_right = std::max(0, t_in_end - n_input_cols);
257
258 // Output padding (right) for the tile
259 const int t_out_end = (tile_j + 1) * output_tile_cols;
260 const int t_pad_out_right = std::max(0, t_out_end - n_output_cols);
261
262 // Get pointers into the inputs and outputs
263 const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
264 const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride);
265 TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride;
266
267 // Apply the specific tile processing function
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000268 const bool pad_top = row_pad_in_top > 0;
269 const bool pad_left = t_pad_in_left > 0;
270 const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom;
271 const bool pad_right = t_pad_in_right || t_pad_out_right;
272
273 const TileFn tilefn = [&] () {
274 if (!pad_top && !pad_left && !pad_bottom && !pad_right)
275 {
276 // No padding
277 return tilefn_unpadded;
278 }
279 else if (pad_top && !pad_left && !pad_bottom && !pad_right)
280 {
281 // Padding on the top only, subtract off the minimum expected padding in
282 // order to index into the array of specialised methods.
283 const int index = row_pad_in_top - min_in_pad_top;
284 return tilefn_top[index];
285 }
286 else if (!pad_top && pad_left && !pad_bottom && !pad_right)
287 {
288 // Padding on the left only, subtract off the minimum expected padding in
289 // order to index into the array of specialised methods.
290 const int index = t_pad_in_left - min_in_pad_left;
291 return tilefn_left[index];
292 }
293 else if (!pad_top && !pad_left && pad_bottom && !pad_right)
294 {
295 // Padding on the bottom only
296 return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom];
297 }
298 else if (!pad_top && !pad_left && !pad_bottom && pad_right)
299 {
300 // Padding on the right only
301 return tilefn_right[t_pad_in_right][t_pad_out_right];
302 }
303 else
304 {
305 // Otherwise use generic tile processing method.
306 return tilefn_generic;
307 }
308 }();
309
310 tilefn(
311 n_channels,
312 weights, weight_row_stride, weight_col_stride,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000313 inptr_col, in_row_stride, in_col_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000314 outptr_col, out_row_stride, out_col_stride,
315 row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100316 row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset
Georgios Pinitas4074c992018-01-30 18:13:46 +0000317 );
318 }
319}
320
321
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000322// New templated struct used solely as a way to provide tile processing
323// specialisations.
324template <int OutputTileRows, int OutputTileCols,
325 int KernelRows, int KernelCols,
326 int StrideRows, int StrideCols,
327 typename TIn, typename TOut>
328struct DepthwiseConvolutionImpl : public DepthwiseConvolution<
329 OutputTileRows, OutputTileCols,
330 KernelRows, KernelCols,
331 StrideRows, StrideCols, TIn, TOut
332>
333{
334 typedef DepthwiseConvolution<
335 OutputTileRows, OutputTileCols,
336 KernelRows, KernelCols,
337 StrideRows, StrideCols,
338 TIn, TOut
339 > DWC;
340
341 /** Perform the depthwise convolution of a tile.
342 *
343 * @param[in] n_channels Number of channels.
344 * @param[in] weights Pointer to Height x Width x Channels ordered weights.
345 * @param[in] inptr Pointer to the top-left unpadded value of the tile.
346 * @param[in] in_row_stride Stride between rows of the input tensor.
347 * @param[in] in_col_stride Stride between columns of the input tensor.
348 * @param[out] outptr Pointer to the top-left output value for the tile.
349 * @param[in] out_row_stride Stride between rows of the output tensor.
350 * @param[in] out_col_stride Stride between columns of the output tensor.
351 *
352 * The following parameters may be ignored if the function has been
353 * specialised for specific padding constraints.
354 *
355 * @param[in] _in_pad_top Padding to apply to top of input tile.
356 * @param[in] _in_pad_left Padding to apply to left of input tile.
357 * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
358 * @param[in] _in_pad_right Padding to apply to right of input tile.
359 * @param[in] _out_pad_bottom Null cells at bottom of output tile.
360 * @param[in] _out_pad_right Null cells at right of output tile.
361 */
362 template <
363 bool Specialize=false, // Specialize (or not) the method
364 int InPadTop=0, // If specialized, top padding
365 int InPadLeft=0, // If specialized, left padding
366 int InPadBottom=0, // If specialized, bottom padding
367 int InPadRight=0, // If specialized, right padding
368 int OutPadBottom=0, // If specialized, bottom output padding
369 int OutPadRight=0 // If specialized, bottom right padding
370 >
371 static void process_tile(
372 const int n_channels,
373 const TIn* const weights,
374 const int weight_row_stride,
375 const int weight_col_stride,
376 const TIn* const inptr,
377 const int in_row_stride,
378 const int in_col_stride,
379 TOut* const outptr,
380 const int out_row_stride,
381 const int out_col_stride,
382 const int in_pad_top=0,
383 const int in_pad_left=0,
384 const int in_pad_bottom=0,
385 const int in_pad_right=0,
386 const int out_pad_bottom=0,
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100387 const int out_pad_right=0,
388 const int input_offset=0,
389 const int weights_offset=0
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000390 );
391};
392
393
Georgios Pinitas4074c992018-01-30 18:13:46 +0000394template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
395template <
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000396 bool Specialize,
397 int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
398 int OutPadBottom, int OutPadRight
Georgios Pinitas4074c992018-01-30 18:13:46 +0000399>
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000400void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile(
Georgios Pinitas4074c992018-01-30 18:13:46 +0000401 const int n_channels,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000402 const TIn *__restrict__ const weights,
403 const int weight_row_stride,
404 const int weight_col_stride,
405 const TIn *__restrict__ const inptr,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000406 const int in_row_stride,
407 const int in_col_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000408 TOut *__restrict__ const outptr,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000409 const int out_row_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000410 const int out_col_stride,
411 const int _in_pad_top,
412 const int _in_pad_left,
413 const int _in_pad_bottom,
414 const int _in_pad_right,
415 const int _out_pad_bottom,
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100416 const int _out_pad_right,
417 const int _input_offset,
418 const int _weights_offset
Georgios Pinitas4074c992018-01-30 18:13:46 +0000419)
420{
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000421 constexpr auto inner_tile_rows = DWC::inner_tile_rows;
422 constexpr auto inner_tile_cols = DWC::inner_tile_cols;
423 constexpr auto kernel_rows = DWC::kernel_rows;
424 constexpr auto kernel_cols = DWC::kernel_cols;
425 constexpr auto output_tile_rows = DWC::output_tile_rows;
426 constexpr auto output_tile_cols = DWC::output_tile_cols;
427 constexpr auto stride_rows = DWC::stride_rows;
428 constexpr auto stride_cols = DWC::stride_cols;
429
430 // Extract parameters
431 const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
432 const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
433 const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
434 const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
435 const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
436 const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
437
Georgios Pinitas4074c992018-01-30 18:13:46 +0000438 // Compute valid ranges of the tile
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000439 const int in_cells_i = inner_tile_rows - in_pad_bottom;
440 const int in_cells_j = inner_tile_cols - in_pad_right;
441 const int out_cells_i = output_tile_rows - out_pad_bottom;
442 const int out_cells_j = output_tile_cols - out_pad_right;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000443
444 // Instantiate pointers
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000445 const TIn* __restrict__ inptr_base = inptr;
446 const TIn* __restrict__ wptr_base = weights;
447 TOut* __restrict__ outptr_base = outptr;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000448
449 // Perform the depthwise convolution
450 int channels_remaining = n_channels;
451 for (; channels_remaining; channels_remaining--)
452 {
453 // Load input tile
454 TIn u[inner_tile_rows][inner_tile_cols];
455 for (int i = 0; i < inner_tile_rows; i++)
456 {
457 const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
458 for (int j = 0; j < inner_tile_cols; j++)
459 {
460 if (i < in_pad_top || in_cells_i <= i ||
461 j < in_pad_left || in_cells_j <= j)
462 {
463 u[i][j] = static_cast<TIn>(0);
464 }
465 else
466 {
467 u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
468 }
469 }
470 }
471 inptr_base++;
472
473 // Load weights tile
474 TIn w[kernel_rows][kernel_cols];
475 for (int i = 0; i < kernel_rows; i++)
476 {
477 const TIn* const wptr_row = wptr_base + i*weight_row_stride;
478 for (int j = 0; j < kernel_cols; j++)
479 {
480 w[i][j] = *(wptr_row + j*weight_col_stride);
481 }
482 }
483 wptr_base++;
484
485 // Perform the convolution
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000486 TOut v[output_tile_rows][output_tile_cols];
Georgios Pinitas4074c992018-01-30 18:13:46 +0000487 for (int out_i = 0; out_i < out_cells_i; out_i++)
488 {
489 for (int out_j = 0; out_j < out_cells_j; out_j++)
490 {
491 // Clear the accumulator
492 v[out_i][out_j] = static_cast<TOut>(0);
493
494 // Base co-ordinate
495 const int base_i = out_i * stride_rows;
496 const int base_j = out_j * stride_cols;
497
498 // Fill the accumulator
499 for (int in_i = 0; in_i < kernel_rows; in_i++)
500 {
501 const int i = base_i + in_i;
502 for (int in_j = 0; in_j < kernel_cols; in_j++)
503 {
504 const int j = base_j + in_j;
505 v[out_i][out_j] += w[in_i][in_j] * u[i][j];
506 }
507 }
508 }
509 }
510
511 // Store the output tile
512 for (int i = 0; i < out_cells_i; i++)
513 {
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000514 TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000515 for (int j = 0; j < out_cells_j; j++)
516 {
517 *(outptr_row + j*out_col_stride) = v[i][j];
518 }
519 }
520 outptr_base++;
521 }
522}
523
Georgios Pinitas4074c992018-01-30 18:13:46 +0000524} // namespace depthwise