Blame - arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp - ml/ComputeLibrary

blob: b33f2768adbf4c723f21e4eb7282ad3e2d8d48cc [file] [log] [blame]

/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
				24
/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 *          NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
				32
				33	#include <algorithm>
				34	#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
				35	#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
				36
				37	#pragma once
				38
				39	namespace depthwise
				40	{
				41
// Number of channels covered by one unit of the work window: `get_window`
// divides the channel count by this value (rounding up) and `run` multiplies
// its `start`/`stop` arguments by it.
constexpr unsigned int CHANNEL_BLOCK = 16;
				43
namespace
{
/** Total padding (both sides together) to apply along one dimension.
 *
 * @param[in] padding_same If false no padding is applied and 0 is returned.
 * @param[in] kernel_dim   Size of the kernel along the dimension.
 * @param[in] stride_dim   Stride along the dimension; must be non-zero since
 *                         it is used as a divisor.
 * @param[in] input_dim    Size of the input along the dimension.
 *
 * @return `max(kernel_dim - r, 0)` where `r` is `input_dim % stride_dim`, or
 *         `stride_dim` when that remainder is zero.
 */
inline int pad_along_dim(
  const bool padding_same,
  const int kernel_dim,
  const int stride_dim,
  const int input_dim
)
{
  if (!padding_same)
  {
    return 0;
  }

  // A zero remainder means the input is an exact multiple of the stride, in
  // which case a whole stride is subtracted from the kernel size instead.
  const int remainder = input_dim % stride_dim;
  const int covered = remainder ? remainder : stride_dim;
  return std::max(kernel_dim - covered, 0);
}
}  // namespace
				61
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	62	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				63	int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
				64	const int dim_size, const bool same_padding
				65	)
				66	{
				67	return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR);
				68	}
				69
Georgios Pinitas	1a57ad1	2019-01-09 16:11:51 +0000	[diff] [blame^]	70	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				71	int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
				72	const int dim_size, const unsigned int padding_before, const unsigned int padding_after
				73	)
				74	{
				75	return iceildiv(dim_size + padding_before + padding_after - KR + 1, SR);
				76	}
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	77
				78	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				79	DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
				80	const int n_batches, const int n_input_rows, const int n_input_cols,
				81	const int n_channels, const bool padding_same,
				82	const TIn* const weights,
				83	const TIn* const input,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	84	TOut* const output,
				85	const int weight_col_stride,
				86	const int weight_row_stride,
				87	const int input_col_stride,
				88	const int input_row_stride,
				89	const int input_batch_stride,
				90	const int output_col_stride,
				91	const int output_row_stride,
				92	const int output_batch_stride
Georgios Pinitas	1a57ad1	2019-01-09 16:11:51 +0000	[diff] [blame^]	93	) : DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>(
				94	n_batches, n_input_rows, n_input_cols,
				95	n_channels,
				96	pad_along_dim(padding_same, KR, SR, n_input_rows) / 2, /* top padding */
				97	pad_along_dim(padding_same, KC, SC, n_input_cols) / 2, /* left padding */
				98	iceildiv(pad_along_dim(padding_same, KR, SR, n_input_rows), 2), /* bottom padding */
				99	iceildiv(pad_along_dim(padding_same, KC, SC, n_input_cols), 2), /* right padding */
				100	weights, input, output,
				101	weight_col_stride, weight_row_stride,
				102	input_col_stride, input_row_stride, input_batch_stride,
				103	output_col_stride, output_row_stride, output_batch_stride
				104	)
				105	{
				106	}
				107
				108
				109	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				110	DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
				111	const int n_batches, const int n_input_rows, const int n_input_cols,
				112	const int n_channels,
				113	const unsigned int padding_top,
				114	const unsigned int padding_left,
				115	const unsigned int padding_bottom,
				116	const unsigned int padding_right,
				117	const TIn* const weights,
				118	const TIn* const input,
				119	TOut* const output,
				120	const int weight_col_stride,
				121	const int weight_row_stride,
				122	const int input_col_stride,
				123	const int input_row_stride,
				124	const int input_batch_stride,
				125	const int output_col_stride,
				126	const int output_row_stride,
				127	const int output_batch_stride
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	128	) : _weights(weights), _input(input), _output(output),
				129	_n_batches(n_batches),
				130	_n_input_rows(n_input_rows),
				131	_n_input_cols(n_input_cols),
				132	_n_channels(n_channels),
Georgios Pinitas	1a57ad1	2019-01-09 16:11:51 +0000	[diff] [blame^]	133	_n_output_rows(get_output_size(n_input_rows, padding_top, padding_bottom)),
				134	_n_output_cols(get_output_size(n_input_cols, padding_left, padding_right)),
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	135	_n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
				136	_n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
Georgios Pinitas	1a57ad1	2019-01-09 16:11:51 +0000	[diff] [blame^]	137	_padding_top(padding_top),
				138	_padding_left(padding_left),
				139	_padding_bottom(padding_bottom),
				140	_padding_right(padding_right),
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	141	_weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels),
				142	_weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride),
				143	_input_col_stride(input_col_stride ? input_col_stride : _n_channels),
				144	_input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride),
				145	_input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride),
				146	_output_col_stride(output_col_stride ? output_col_stride : _n_channels),
				147	_output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride),
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	148	_output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride),
				149	_input_offset(0), _weights_offset(0)
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	150	{
				151	}
				152
				153
				154	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				155	unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const
				156	{
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	157	// Parallelise over blocks of channels.
				158	return iceildiv(_n_channels, CHANNEL_BLOCK);
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	159	}
				160
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	161	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				162	void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset)
				163	{
				164	_input_offset = input_offset;
				165	_weights_offset = weights_offset;
				166	}
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	167
				168	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				169	void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
				170	const unsigned int start,
				171	const unsigned int stop
				172	)
				173	{
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	174	// Parallelise over blocks of channels
				175	const auto start_channel = CHANNEL_BLOCK * start;
				176	const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	177
				178	// Compute top and bottom padding for input and output
Georgios Pinitas	1a57ad1	2019-01-09 16:11:51 +0000	[diff] [blame^]	179	const int input_pad_top = _padding_top;
				180	const int input_pad_left = _padding_left;
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	181	constexpr int tile_overlap = kernel_rows - stride_rows;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	182
				183	// Perform the convolution by calling `process_tile_row` for each tile row in
				184	// each batch.
				185	for (int batch = 0; batch < _n_batches; batch++)
				186	{
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	187	const TIn* const inptr_batch = _input + batch*_input_batch_stride;
				188	TOut* const outptr_batch = _output + batch*_output_batch_stride;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	189
				190	// Loop over rows of tiles
				191	for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
				192	{
				193	// Pointer to the row
				194	const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	195	const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)tile_i - input_row_offset)_input_row_stride);
				196	TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	197
				198	// Input padding (top + bottom) for the row
				199	const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
				200	const int input_row_bottom = input_row_top + inner_tile_rows;
				201	const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
				202	const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);
				203
				204	// Output padding (bottom) for the row
				205	const int output_row_bottom = (tile_i + 1)*output_tile_rows;
				206	const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
				207
				208	// Process the row
				209	process_tile_row(
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	210	stop_channel - start_channel,
				211	_weights + start_channel, _weight_row_stride, _weight_col_stride,
				212	inptr_row + start_channel, _input_row_stride, _input_col_stride,
				213	outptr_row + start_channel, _output_row_stride, _output_col_stride,
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	214	input_row_pad_top, input_pad_left, input_row_pad_bottom,
				215	output_row_pad_bottom,
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	216	_n_tile_cols, _n_input_cols, _n_output_cols,
				217	_input_offset, _weights_offset
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	218	);
				219	}
				220	}
				221	}
				222
				223
				224	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				225	void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row(
				226	const int n_channels,
				227	const TIn* const weights,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	228	const int weight_row_stride,
				229	const int weight_col_stride,
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	230	const TIn* const inptr,
				231	const int in_row_stride,
				232	const int in_col_stride,
				233	TOut* const outptr,
				234	const int out_row_stride,
				235	const int out_col_stride,
				236	const int row_pad_in_top,
				237	const int row_pad_in_left,
				238	const int row_pad_in_bottom,
				239	const int row_pad_out_bottom,
				240	const int n_tiles,
				241	const int n_input_cols,
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	242	const int n_output_cols,
				243	const int input_offset,
				244	const int weights_offset
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	245	)
				246	{
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	247	constexpr int tile_overlap = kernel_cols - stride_cols;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	248
				249	// Loop over columns of tiles
				250	for (int tile_j = 0; tile_j < n_tiles; tile_j++)
				251	{
				252	// Input padding (left + right) for the tile
				253	const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0;
				254	const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
				255	const int t_in_end = t_in_start + inner_tile_cols;
				256	const int t_pad_in_right = std::max(0, t_in_end - n_input_cols);
				257
				258	// Output padding (right) for the tile
				259	const int t_out_end = (tile_j + 1) * output_tile_cols;
				260	const int t_pad_out_right = std::max(0, t_out_end - n_output_cols);
				261
				262	// Get pointers into the inputs and outputs
				263	const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
				264	const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)tile_j - col_offset)in_col_stride);
				265	TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride;
				266
				267	// Apply the specific tile processing function
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	268	const bool pad_top = row_pad_in_top > 0;
				269	const bool pad_left = t_pad_in_left > 0;
				270	const bool pad_bottom = row_pad_in_bottom \|\| row_pad_out_bottom;
				271	const bool pad_right = t_pad_in_right \|\| t_pad_out_right;
				272
				273	const TileFn tilefn = [&] () {
				274	if (!pad_top && !pad_left && !pad_bottom && !pad_right)
				275	{
				276	// No padding
				277	return tilefn_unpadded;
				278	}
				279	else if (pad_top && !pad_left && !pad_bottom && !pad_right)
				280	{
				281	// Padding on the top only, subtract off the minimum expected padding in
				282	// order to index into the array of specialised methods.
				283	const int index = row_pad_in_top - min_in_pad_top;
				284	return tilefn_top[index];
				285	}
				286	else if (!pad_top && pad_left && !pad_bottom && !pad_right)
				287	{
				288	// Padding on the left only, subtract off the minimum expected padding in
				289	// order to index into the array of specialised methods.
				290	const int index = t_pad_in_left - min_in_pad_left;
				291	return tilefn_left[index];
				292	}
				293	else if (!pad_top && !pad_left && pad_bottom && !pad_right)
				294	{
				295	// Padding on the bottom only
				296	return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom];
				297	}
				298	else if (!pad_top && !pad_left && !pad_bottom && pad_right)
				299	{
				300	// Padding on the right only
				301	return tilefn_right[t_pad_in_right][t_pad_out_right];
				302	}
				303	else
				304	{
				305	// Otherwise use generic tile processing method.
				306	return tilefn_generic;
				307	}
				308	}();
				309
				310	tilefn(
				311	n_channels,
				312	weights, weight_row_stride, weight_col_stride,
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	313	inptr_col, in_row_stride, in_col_stride,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	314	outptr_col, out_row_stride, out_col_stride,
				315	row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	316	row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	317	);
				318	}
				319	}
				320
				321
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	322	// New templated struct used solely as a way to provide tile processing
				323	// specialisations.
				324	template <int OutputTileRows, int OutputTileCols,
				325	int KernelRows, int KernelCols,
				326	int StrideRows, int StrideCols,
				327	typename TIn, typename TOut>
				328	struct DepthwiseConvolutionImpl : public DepthwiseConvolution<
				329	OutputTileRows, OutputTileCols,
				330	KernelRows, KernelCols,
				331	StrideRows, StrideCols, TIn, TOut
				332	>
				333	{
				334	typedef DepthwiseConvolution<
				335	OutputTileRows, OutputTileCols,
				336	KernelRows, KernelCols,
				337	StrideRows, StrideCols,
				338	TIn, TOut
				339	> DWC;
				340
				341	/** Perform the depthwise convolution of a tile.
				342	*
				343	* @param[in] n_channels Number of channels.
				344	* @param[in] weights Pointer to Height x Width x Channels ordered weights.
				345	* @param[in] inptr Pointer to the top-left unpadded value of the tile.
				346	* @param[in] in_row_stride Stride between rows of the input tensor.
				347	* @param[in] in_col_stride Stride between columns of the input tensor.
				348	* @param[out] outptr Pointer to the top-left output value for the tile.
				349	* @param[in] out_row_stride Stride between rows of the output tensor.
				350	* @param[in] out_col_stride Stride between columns of the output tensor.
				351	*
				352	* The following parameters may be ignored if the function has been
				353	* specialised for specific padding constraints.
				354	*
				355	* @param[in] _in_pad_top Padding to apply to top of input tile.
				356	* @param[in] _in_pad_left Padding to apply to left of input tile.
				357	* @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
				358	* @param[in] _in_pad_right Padding to apply to right of input tile.
				359	* @param[in] _out_pad_bottom Null cells at bottom of output tile.
				360	* @param[in] _out_pad_right Null cells at right of output tile.
				361	*/
				362	template <
				363	bool Specialize=false, // Specialize (or not) the method
				364	int InPadTop=0, // If specialized, top padding
				365	int InPadLeft=0, // If specialized, left padding
				366	int InPadBottom=0, // If specialized, bottom padding
				367	int InPadRight=0, // If specialized, right padding
				368	int OutPadBottom=0, // If specialized, bottom output padding
				369	int OutPadRight=0 // If specialized, bottom right padding
				370	>
				371	static void process_tile(
				372	const int n_channels,
				373	const TIn* const weights,
				374	const int weight_row_stride,
				375	const int weight_col_stride,
				376	const TIn* const inptr,
				377	const int in_row_stride,
				378	const int in_col_stride,
				379	TOut* const outptr,
				380	const int out_row_stride,
				381	const int out_col_stride,
				382	const int in_pad_top=0,
				383	const int in_pad_left=0,
				384	const int in_pad_bottom=0,
				385	const int in_pad_right=0,
				386	const int out_pad_bottom=0,
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	387	const int out_pad_right=0,
				388	const int input_offset=0,
				389	const int weights_offset=0
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	390	);
				391	};
				392
				393
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	394	template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
				395	template <
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	396	bool Specialize,
				397	int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
				398	int OutPadBottom, int OutPadRight
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	399	>
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	400	void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile(
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	401	const int n_channels,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	402	const TIn *__restrict__ const weights,
				403	const int weight_row_stride,
				404	const int weight_col_stride,
				405	const TIn *__restrict__ const inptr,
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	406	const int in_row_stride,
				407	const int in_col_stride,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	408	TOut *__restrict__ const outptr,
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	409	const int out_row_stride,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	410	const int out_col_stride,
				411	const int _in_pad_top,
				412	const int _in_pad_left,
				413	const int _in_pad_bottom,
				414	const int _in_pad_right,
				415	const int _out_pad_bottom,
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	416	const int _out_pad_right,
				417	const int _input_offset,
				418	const int _weights_offset
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	419	)
				420	{
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	421	constexpr auto inner_tile_rows = DWC::inner_tile_rows;
				422	constexpr auto inner_tile_cols = DWC::inner_tile_cols;
				423	constexpr auto kernel_rows = DWC::kernel_rows;
				424	constexpr auto kernel_cols = DWC::kernel_cols;
				425	constexpr auto output_tile_rows = DWC::output_tile_rows;
				426	constexpr auto output_tile_cols = DWC::output_tile_cols;
				427	constexpr auto stride_rows = DWC::stride_rows;
				428	constexpr auto stride_cols = DWC::stride_cols;
				429
				430	// Extract parameters
				431	const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
				432	const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
				433	const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
				434	const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
				435	const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
				436	const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
				437
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	438	// Compute valid ranges of the tile
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	439	const int in_cells_i = inner_tile_rows - in_pad_bottom;
				440	const int in_cells_j = inner_tile_cols - in_pad_right;
				441	const int out_cells_i = output_tile_rows - out_pad_bottom;
				442	const int out_cells_j = output_tile_cols - out_pad_right;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	443
				444	// Instantiate pointers
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	445	const TIn* __restrict__ inptr_base = inptr;
				446	const TIn* __restrict__ wptr_base = weights;
				447	TOut* __restrict__ outptr_base = outptr;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	448
				449	// Perform the depthwise convolution
				450	int channels_remaining = n_channels;
				451	for (; channels_remaining; channels_remaining--)
				452	{
				453	// Load input tile
				454	TIn u[inner_tile_rows][inner_tile_cols];
				455	for (int i = 0; i < inner_tile_rows; i++)
				456	{
				457	const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
				458	for (int j = 0; j < inner_tile_cols; j++)
				459	{
				460	if (i < in_pad_top \|\| in_cells_i <= i \|\|
				461	j < in_pad_left \|\| in_cells_j <= j)
				462	{
				463	u[i][j] = static_cast<TIn>(0);
				464	}
				465	else
				466	{
				467	u[i][j] = (inptr_row + (j - in_pad_left)in_col_stride);
				468	}
				469	}
				470	}
				471	inptr_base++;
				472
				473	// Load weights tile
				474	TIn w[kernel_rows][kernel_cols];
				475	for (int i = 0; i < kernel_rows; i++)
				476	{
				477	const TIn* const wptr_row = wptr_base + i*weight_row_stride;
				478	for (int j = 0; j < kernel_cols; j++)
				479	{
				480	w[i][j] = (wptr_row + jweight_col_stride);
				481	}
				482	}
				483	wptr_base++;
				484
				485	// Perform the convolution
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	486	TOut v[output_tile_rows][output_tile_cols];
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	487	for (int out_i = 0; out_i < out_cells_i; out_i++)
				488	{
				489	for (int out_j = 0; out_j < out_cells_j; out_j++)
				490	{
				491	// Clear the accumulator
				492	v[out_i][out_j] = static_cast<TOut>(0);
				493
				494	// Base co-ordinate
				495	const int base_i = out_i * stride_rows;
				496	const int base_j = out_j * stride_cols;
				497
				498	// Fill the accumulator
				499	for (int in_i = 0; in_i < kernel_rows; in_i++)
				500	{
				501	const int i = base_i + in_i;
				502	for (int in_j = 0; in_j < kernel_cols; in_j++)
				503	{
				504	const int j = base_j + in_j;
				505	v[out_i][out_j] += w[in_i][in_j] * u[i][j];
				506	}
				507	}
				508	}
				509	}
				510
				511	// Store the output tile
				512	for (int i = 0; i < out_cells_i; i++)
				513	{
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	514	TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	515	for (int j = 0; j < out_cells_j; j++)
				516	{
				517	(outptr_row + jout_col_stride) = v[i][j];
				518	}
				519	}
				520	outptr_base++;
				521	}
				522	}
				523
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	524	} // namespace depthwise