/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "depthwise.hpp"
#include "utils.hpp"

namespace arm_conv {
namespace depthwise {

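// Trait giving the default accumulator type for a given input/weight type;
// quantized (int8_t / uint8_t) data accumulates into int32_t.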
template <typename T> struct DefaultTAccum { using Type = T; };
template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };

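// Trait giving the default output stage for a given output type; quantized
// outputs are requantized via arm_gemm::Requantize32, all others need nothing.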
template <typename T> struct DefaultOutputStage { using Type = Nothing; };
template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };

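// Interface through which the driver queries the tile geometry (input and
// output rows/columns) of a concrete depthfirst strategy.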
class IDepthfirstStrategy
{
  public:
  virtual ~IDepthfirstStrategy() = default;

  virtual unsigned int get_input_rows() const = 0;
  virtual unsigned int get_input_cols() const = 0;

  virtual unsigned int get_output_rows() const = 0;
  virtual unsigned int get_output_cols() const = 0;
};


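// Lightweight view of a tensor: a base pointer together with its row and
// column strides.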
template <typename T>
struct TensorSpec
{
  T base;
  size_t ld_row, ld_col;

  TensorSpec(T ptr, size_t ld_row, size_t ld_col)
  : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
};


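// Driver which covers the output tensor tile by tile (depth-first),
// delegating the per-tile computation to a derived class through the virtual
// methods below.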
template <typename TInput, typename TWeight, typename TOutput>
class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
  protected:
  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;

  // The strategy which we're applying to solve the depthwise convolution.
  std::unique_ptr<const IDepthfirstStrategy> m_strat;

  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread() const = 0;

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *) const = 0;

  /* Compute a portion of the output tensor with padding. */
  virtual void compute_tile_padded(
    const DepthwiseArgs &args,
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const = 0;

  /* Compute a portion of the work with only top/bottom padding.
   *
   * The default implementation of this repeatedly calls into the padded tile
   * variant.
   */
  virtual void compute_row_padded_tile_row(
    const DepthwiseArgs &args,
    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
    const unsigned int output_channel_start, const unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const
  {
    for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
    {
      this->compute_tile_padded(
        args,
        output_i, output_j, output_channel_start, output_channel_end,
        input, output, parameters, working_space
      );
    }
  }

  /* Compute a portion of the output tensor with no padding.
   *
   * The default implementation of this repeatedly calls into the padded
   * variant.
   */
  virtual void compute_tiles_unpadded(
    const DepthwiseArgs &args,
    unsigned int start_output_i, unsigned int start_output_j,
    unsigned int n_tile_rows, unsigned int n_tile_cols,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const
  {
    for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
    {
      unsigned int row_start_output_j = start_output_j;
      for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
      {
        this->compute_tile_padded(
          args,
          start_output_i, row_start_output_j,
          output_channel_start, output_channel_end,
          input, output, parameters, working_space
        );
        row_start_output_j += m_strat->get_output_cols();
      }
      start_output_i += m_strat->get_output_rows();
    }
  }

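  // Implementation of the parent class's execute_internal hook: initialise
  // this thread's working space, then walk the output tensor batch by batch
  // and row by row, dispatching to the unpadded, row-padded, or fully padded
  // tile computations as appropriate.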
  void execute_internal(
    const DepthwiseArgs &args,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const void *parameters,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
    this->initialise_working_space(thread_working_space);

    // Construct convenient representations of the input/output tensors.
    TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
    TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);

    const auto n_output_channels = args.input_channels * args.channel_multiplier;

    for (unsigned int batch = 0; batch < args.n_batches; batch++)
    {
      // Iterate over rows of the output tensor; we stripe over the tiles.
      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < args.output_rows;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
        const auto end_output_i = start_output_i + m_strat->get_output_rows();
        const bool pad_output_bottom = args.output_rows < end_output_i;

        const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
        const bool pad_input_top = start_input_i < 0;
        const int end_input_i = start_input_i + m_strat->get_input_rows();
        const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
        // We only need to account for input padding if direct padding is not supported.
        const bool pad_row = ((pad_input_top || pad_input_bottom) && !this->supports_direct_padding())
                             || pad_output_bottom;

        // Iterate over the columns of the output tensor; we attempt to grab as
        // much as possible of the unpadded regions, so the loop structure is a
        // bit odd.
        unsigned int start_output_j = 0;
        while (start_output_j < args.output_cols)
        {
          const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
          const bool pad_input_left = start_in_j < 0;

          // Determine if we can process a number of unpadded tiles in one go.
          int n_unpadded_tiles = 0;
          if ((!pad_input_left) || this->supports_direct_padding())
          {
            // Determine the maximum number of tiles we could handle.
            n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();

            // Handle padding on the right hand edge
            const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
            int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
            int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;

            while (n_unpadded_tiles > 0 &&
                   (static_cast<int>(args.output_cols) < end_output_j ||
                    static_cast<int>(args.input_cols) < end_input_j))
            {
              n_unpadded_tiles--;
              end_output_j -= m_strat->get_output_cols();
              end_input_j -= tile_stride;
            }
          }

          // Process unpadded tiles, if possible, otherwise process a padded tile.
          if (n_unpadded_tiles)
          {
            if (!pad_row)
            {
              // Completely unpadded execution
              this->compute_tiles_unpadded(
                args,
                start_output_i, start_output_j,
                1, n_unpadded_tiles,  // Compute a row of unpadded tiles
                0, n_output_channels,  // Compute all channels
                input_tensor, output_tensor, parameters, thread_working_space
              );
            }
            else
            {
              // Top/bottom padding only
              this->compute_row_padded_tile_row(
                args,
                start_output_i, start_output_j, n_unpadded_tiles,
                0, n_output_channels,  // Compute all channels
                input_tensor, output_tensor, parameters, thread_working_space
              );
            }
            start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
          }
          else
          {
            this->compute_tile_padded(
              args,
              start_output_i, start_output_j,
              0, n_output_channels,  // Compute all channels
              input_tensor, output_tensor, parameters, thread_working_space
            );
            start_output_j += m_strat->get_output_cols();
          }
        }
      }

      // Progress the pointers for the next batch.
      input_tensor.base += ld_input_batch;
      output_tensor.base += ld_output_batch;
    }
  }

  public:
  DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
  : Parent(args), m_strat(strategy)
  {
  }

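  // Total working space is one per-thread buffer for each participating thread.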
  size_t get_working_size(unsigned int n_threads) const override final
  {
    return n_threads * this->get_working_size_per_thread();
  }

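  // Whether input padding can be handled directly by the "unpadded" code
  // paths; when this returns false (the default), tiles which touch input
  // padding are routed through the padded-tile implementations instead.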
  virtual bool supports_direct_padding() const
  {
    return false;
  }
};

}  // namespace depthwise
}  // namespace arm_conv