/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "pooling.hpp"
#include "src/core/NEON/kernels/arm_gemm/utils.hpp"

namespace arm_conv {
namespace pooling {

class IDepthfirstStrategy
{
  public:
  virtual ~IDepthfirstStrategy() = default;

  virtual unsigned int get_input_rows() const = 0;
  virtual unsigned int get_input_cols() const = 0;

  virtual unsigned int get_output_rows() const = 0;
  virtual unsigned int get_output_cols() const = 0;
};
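
// Illustrative sketch only: this file defines no concrete strategy. Assuming
// a 3x3 pooling window with unit stride, a minimal (hypothetical) strategy
// could report a 3x3 input block for every 1x1 output block:
//
//   class Pool3x3Strategy : public IDepthfirstStrategy  // hypothetical name
//   {
//     public:
//     unsigned int get_input_rows() const override { return 3; }
//     unsigned int get_input_cols() const override { return 3; }
//     unsigned int get_output_rows() const override { return 1; }
//     unsigned int get_output_cols() const override { return 1; }
//   };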


template <typename T>
struct TensorSpec
{
  T base;
  size_t ld_row, ld_col;

  TensorSpec(T ptr, size_t ld_row, size_t ld_col)
    : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
};
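
// Note (inferred from how the driver advances these pointers below): `base`
// points at the first element and `ld_row`/`ld_col` are strides measured in
// elements, so element (i, j) of a plane lives at `base + i*ld_row + j*ld_col`.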


template <typename TInput, typename TOutput>
class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
{
  protected:
  using Parent = PoolingCommon<TInput, TOutput>;

  // The strategy which we're applying to solve the pooling problem.
  std::unique_ptr<const IDepthfirstStrategy> m_strat;

  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread(unsigned int n_input_channels) const = 0;

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *, unsigned int n_input_channels) const = 0;

  /* Compute a portion of the output tensor with padding. */
  virtual void compute_tile_padded(
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const = 0;

  /* Compute a portion of the work with only top/bottom padding.
   *
   * The default implementation of this repeatedly calls into the padded tile
   * variant.
   */
  virtual void compute_row_padded_tile_row(
    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
    const unsigned int output_channel_start, const unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const
  {
    for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
    {
      this->compute_tile_padded(
        output_i, output_j, output_channel_start, output_channel_end,
        input, output, working_space
      );
    }
  }
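
  // Worked example (hypothetical numbers): with get_output_cols() == 4,
  // n_tile_cols == 3 and output_j == 8, the loop above computes padded tiles
  // whose first output columns are 8, 12 and 16.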

  /* Compute a portion of the output tensor with no padding.
   *
   * The default implementation of this repeatedly calls into the padded
   * variant.
   */
  virtual void compute_tiles_unpadded(
    unsigned int start_output_i, unsigned int start_output_j,
    unsigned int n_tile_rows, unsigned int n_tile_cols,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const
  {
    for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
    {
      this->compute_row_padded_tile_row(
        start_output_i, start_output_j, n_tile_cols,
        output_channel_start, output_channel_end,
        input, output, working_space
      );
      start_output_i += m_strat->get_output_rows();
    }
  }
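
  // Likewise illustrative: a call with n_tile_rows == 2 and
  // get_output_rows() == 3 computes two tile rows whose first output rows
  // are start_output_i and start_output_i + 3.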

  void execute_internal(
    unsigned int n_batches,
    unsigned int input_height,
    unsigned int input_width,
    unsigned int n_channels,
    const PaddingValues &padding,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    unsigned int output_height,
    unsigned int output_width,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_channels);
    this->initialise_working_space(thread_working_space, n_channels);

    // Construct convenient representations of the input/output tensors.
    TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
    TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);

    // If the output is a 1x1 tensor, which commonly occurs at the end of a
    // network, then we change the threading strategy to parallelise over
    // channels rather than rows of the tensor.
    if (n_threads > 1 && output_height == 1 && output_width == 1)
    {
      // Determine how many channels should be assigned to each thread; we
      // round up first to ensure we get a reasonable spread across the
      // threads.
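      // For example (hypothetical values): with n_channels == 20 and
      // n_threads == 4, roundup(20, 16) == 32 and roundup(32, 4) / 4 == 8,
      // so threads 0-2 cover channels [0, 8), [8, 16) and [16, 20), while
      // thread 3 starts at channel 24 >= 20 and returns immediately below.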
      const auto channels_per_thread = arm_gemm::roundup(arm_gemm::roundup(n_channels, 16u), n_threads) / n_threads;
      const auto start_channel = thread_id * channels_per_thread;
      const auto end_channel = std::min(start_channel + channels_per_thread, n_channels);

      if (start_channel >= end_channel)
      {
        // This thread should move on if we have insufficient work to do.
        return;
      }

      for (; n_batches; n_batches--)
      {
        // We know we don't need to iterate over rows or columns here, so just
        // execute the tile.
        this->compute_tile_padded(
          0, 0, // Compute the only output point
          start_channel, end_channel,
          input_tensor, output_tensor, thread_working_space
        );

        // Progress the pointers for the next batch.
        input_tensor.base += ld_input_batch;
        output_tensor.base += ld_output_batch;
      }

      // Exit here, since we've done all of the work using this alternative
      // strategy.
      return;
    }

    for (unsigned int batch = 0; batch < n_batches; batch++)
    {
      // Iterate over rows of the output tensor; we stripe the tile rows
      // across the threads.
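      // Illustrative striping (hypothetical numbers): with
      // get_output_rows() == 2 and n_threads == 3, thread 0 handles output
      // rows {0, 1}, {6, 7}, ..., thread 1 handles {2, 3}, {8, 9}, ..., and
      // thread 2 handles {4, 5}, {10, 11}, ....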
      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < output_height;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
        const auto end_output_i = start_output_i + m_strat->get_output_rows();
        const bool pad_output_bottom = output_height < end_output_i;

        const int start_input_i = start_output_i * this->m_args.pool_stride.rows - padding.top;
        const bool pad_input_top = start_input_i < 0;
        const int end_input_i = start_input_i + m_strat->get_input_rows();
        const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
        const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;

        // Iterate over the columns of the output tensor; we attempt to grab
        // as much of the unpadded region as possible, so each iteration of
        // the loop consumes either a run of unpadded tiles or a single
        // padded tile.
        unsigned int start_output_j = 0;
        while (start_output_j < output_width)
        {
          const int start_in_j = start_output_j * this->m_args.pool_stride.cols - padding.left;
          const bool pad_input_left = start_in_j < 0;

          // Determine if we can process a number of unpadded tiles in one go.
          int n_unpadded_tiles = 0;
          if (!pad_input_left)
          {
            // Determine the maximum number of tiles we could handle.
            n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();

            // Handle padding on the right-hand edge by shrinking the run of
            // tiles until it fits within both the output and input widths.
            const int tile_stride = m_strat->get_output_cols() * this->m_args.pool_stride.cols;
            int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
            int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1) * tile_stride;

            while (n_unpadded_tiles > 0 &&
                   (static_cast<int>(output_width) < end_output_j ||
                    static_cast<int>(input_width) < end_input_j))
            {
              n_unpadded_tiles--;
              end_output_j -= m_strat->get_output_cols();
              end_input_j -= tile_stride;
            }
          }
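
          // Worked example of the trimming above (hypothetical numbers):
          // with get_output_cols() == 3, get_input_cols() == 5, unit column
          // stride, output_width == 9, input_width == 9 and
          // start_output_j == 3 (so start_in_j == 2 given padding.left == 1),
          // we first get n_unpadded_tiles == 2, but
          // end_input_j == 2 + 5 + 3 == 10 > 9, so the loop trims the run
          // down to a single unpadded tile.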

          // Process unpadded tiles if possible; otherwise process a padded tile.
          if (n_unpadded_tiles)
          {
            if (!pad_row)
            {
              // Completely unpadded execution
              this->compute_tiles_unpadded(
                start_output_i, start_output_j,
                1, n_unpadded_tiles, // Compute a row of unpadded tiles
                0, n_channels, // Compute all channels
                input_tensor, output_tensor, thread_working_space
              );
            }
            else
            {
              // Top/bottom padding only
              this->compute_row_padded_tile_row(
                start_output_i, start_output_j, n_unpadded_tiles,
                0, n_channels, // Compute all channels
                input_tensor, output_tensor, thread_working_space
              );
            }
            start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
          }
          else
          {
            this->compute_tile_padded(
              start_output_i, start_output_j,
              0, n_channels, // Compute all channels
              input_tensor, output_tensor, thread_working_space
            );
            start_output_j += m_strat->get_output_cols();
          }
        }
      }

      // Progress the pointers for the next batch.
      input_tensor.base += ld_input_batch;
      output_tensor.base += ld_output_batch;
    }
  }

  public:
  // Takes ownership of the strategy passed in (held in a unique_ptr).
  DepthfirstDriver(const IDepthfirstStrategy *strategy, const PoolingArgs &args)
    : Parent(args), m_strat(strategy)
  {
  }

  size_t get_working_size(unsigned int n_threads) const override
  {
    return this->get_working_size(n_threads, this->m_args.n_channels);
  }

  size_t get_working_size(unsigned int n_threads, unsigned int n_channels) const override final
  {
    // Working space is allocated contiguously, one slice per thread.
    return n_threads * this->get_working_size_per_thread(n_channels);
  }
};
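
// Illustrative use only (assuming the hypothetical Pool3x3Strategy sketched
// above, and eliding the construction of PoolingArgs):
//
//   DepthfirstDriver<float, float> driver(new Pool3x3Strategy(), args);
//   std::vector<uint8_t> scratch(driver.get_working_size(n_threads));
//   // ...then invoke the public execution entry point inherited from
//   // PoolingCommon, passing each thread its thread_id and the shared
//   // scratch buffer.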

} // namespace pooling
} // namespace arm_conv