Blame - src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp - ml/ComputeLibrary

blob: 07c582059f8a2a609348de2b0b28a96b3edce558 [file] [log] [blame]

Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	1	/*
Michael Tyler	74921ee	2023-04-12 17:43:17 +0100	[diff] [blame]	2	* Copyright (c) 2021-2023 Arm Limited.
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	#pragma once
				26
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	27	#include "depthfirst_driver.hpp"
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	28	#include "utils.hpp"
Pablo Marquez Tello	84a0941	2022-06-30 17:00:09 +0100	[diff] [blame]	29	#if !defined(_WIN64) && !defined(__OpenBSD__)
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	30	#include <alloca.h>
Pablo Marquez Tello	84a0941	2022-06-30 17:00:09 +0100	[diff] [blame]	31	#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	32
				33	namespace arm_conv {
				34	namespace pooling {
				35
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	36	template <typename TInput, typename TOutput, typename OutputStage = Nothing>
				37	class IGenericDepthfirstStrategy;
				38
				39	template <typename TInput, typename TOutput>
				40	class IGenericDepthfirstStrategy<TInput, TOutput, Nothing>
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	41	{
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	42	public:
				43	virtual ~IGenericDepthfirstStrategy() = default;
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	44
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	45	typedef void (*KernelType)(
				46	uint64_t window_cells,
				47	uint64_t n_valid_cells,
				48	uint64_t n_channels,
				49	const TInput const ,
				50	TOutput *
				51	);
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	52
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	53	virtual KernelType get_kernel(void) const = 0;
				54	};
				55
				56	template <typename TInput, typename TOutput>
				57	class IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>
				58	{
				59	public:
				60	virtual ~IGenericDepthfirstStrategy() = default;
				61
				62	typedef void (*KernelType)(
				63	uint64_t window_cells,
				64	uint64_t n_valid_cells,
				65	uint64_t n_channels,
				66	const TInput const ,
				67	TOutput *,
				68	const Requantize32 &
				69	);
				70
				71	virtual KernelType get_kernel(void) const = 0;
				72	};
				73
				74	template <typename TInput, typename TOutput, typename OutputStage>
				75	struct Invoker;
				76
				77	template <typename TInput, typename TOutput>
				78	struct Invoker<TInput, TOutput, Nothing>
				79	{
				80	static inline void invoke(
				81	const typename IGenericDepthfirstStrategy<TInput, TOutput, Nothing>::KernelType kern,
				82	uint64_t window_cells,
				83	uint64_t n_valid_cells,
				84	uint64_t n_channels,
				85	const TInput const inptrs,
				86	TOutput *outptr,
				87	const Nothing &
				88	)
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	89	{
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	90	kern(window_cells, n_valid_cells, n_channels, inptrs, outptr);
				91	}
				92	};
				93
				94	template <typename TInput, typename TOutput>
				95	struct Invoker<TInput, TOutput, Requantize32>
				96	{
				97	static inline void invoke(
				98	const typename IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>::KernelType kern,
				99	uint64_t window_cells,
				100	uint64_t n_valid_cells,
				101	uint64_t n_channels,
				102	const TInput const inptrs,
				103	TOutput *outptr,
				104	const Requantize32 &qp
				105	)
				106	{
				107	kern(window_cells, n_valid_cells, n_channels, inptrs, outptr, qp);
				108	}
				109	};
				110
				111	template <typename TInput, typename TOutput, typename OutputStage>
				112	class GenericDepthfirstWrapper : public IDepthfirstStrategy
				113	{
				114	using StratType = IGenericDepthfirstStrategy<TInput, TOutput, OutputStage>;
				115
				116	std::unique_ptr<const StratType> m_strat;
				117	const unsigned int window_rows, window_cols;
				118
				119	public:
				120	GenericDepthfirstWrapper(const StratType *strat, const PoolingArgs &args)
				121	: m_strat(strat), window_rows(args.pool_window.rows), window_cols(args.pool_window.cols)
				122	{
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	123	}
				124
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	125	unsigned int get_input_rows(void) const override { return window_rows; }
				126	unsigned int get_input_cols(void) const override { return window_cols; }
				127	unsigned int get_output_rows(void) const override { return 1; }
				128	unsigned int get_output_cols(void) const override { return 1; }
				129
				130	typename StratType::KernelType get_kernel(void) const { return m_strat->get_kernel(); }
				131	};
				132
				133	template <typename TInput, typename TOutput=TInput, typename OutputStage=Nothing>
				134	class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
				135	{
				136	const OutputStage m_os;
				137
				138	protected:
				139	size_t get_working_size_per_thread(unsigned int) const override { return 0; }
				140	void initialise_working_space(void , unsigned int) const override { / Nothing */ }
				141
				142	/* Compute a portion of the output tensor with padding. */
				143	void compute_tile_padded(
				144	unsigned int output_i, unsigned int output_j,
				145	unsigned int channel_start, unsigned int channel_end,
				146	const TensorSpec<const TInput *> &input,
				147	const TensorSpec<TOutput *> &output,
				148	void *
				149	) const override
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	150	{
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	151	// Determine start position and padding
				152	const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
				153	const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
				154	const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
				155	const int end_i = start_i + this->m_args.pool_window.rows;
				156	const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
				157	const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
				158
				159	const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
				160	const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
				161	const auto pad_left = static_cast<unsigned int>(start_j < 0 ? -start_j : 0);
				162	const int end_j = start_j + this->m_args.pool_window.cols;
				163	const auto pad_right = static_cast<unsigned int>((unsigned int) end_j < this->m_args.input_cols ? 0 : end_j - this->m_args.input_cols);
				164	const auto valid_cols = this->m_args.pool_window.cols - (pad_left + pad_right);
				165
				166	// Determine the number of valid cells and prepare the pointers
				167	const auto n_valid_cells = valid_rows * valid_cols;
				168	auto inptrs = reinterpret_cast<const TInput *>(alloca(n_valid_cells sizeof(TInput *)));
				169	{
				170	auto my_ptr = inptrs;
				171	auto row_ptr = input.base + input_iinput.ld_row + input_jinput.ld_col + channel_start;
				172	for (auto i = valid_rows; i; i--)
				173	{
				174	auto ptr = row_ptr;
				175	row_ptr += input.ld_row;
				176
				177	for (auto j = valid_cols; j; j--)
				178	{
				179	*(my_ptr++) = ptr;
				180	ptr += input.ld_col;
				181	}
				182	}
				183	}
				184
				185	auto outptr = output.base + output_ioutput.ld_row + output_joutput.ld_col + channel_start;
				186
				187	// Some padding variants include (or exclude) the padding values; we handle
				188	// this by computing the extent of the padded input tensor and hence
				189	// computing the total number of cells captured in the pooling window.
				190	const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
				191	const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
				192	const auto right_padded_width = this->m_args.input_cols + this->m_args.padding.right;
				193	const auto captured_cols = std::min<int>(end_j, right_padded_width) - start_j;
				194	const auto captured_cells = captured_rows * captured_cols;
				195	const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
				196
				197	// Execute the kernel
				198	Invoker<TInput, TOutput, OutputStage>::invoke(
				199	reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
				200	window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
				201	);
				202	}
				203
				204	// Compute a portion of the work with only top/bottom padding.
				205	void compute_row_padded_tile_row(
				206	const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
				207	const unsigned int channel_start, const unsigned int channel_end,
				208	const TensorSpec<const TInput *> &input,
				209	const TensorSpec<TOutput *> &output,
Michael Tyler	74921ee	2023-04-12 17:43:17 +0100	[diff] [blame]	210	void *
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	211	) const override
				212	{
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	213	// Determine start position and padding
				214	const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
				215	const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
				216	const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
				217	const int end_i = start_i + this->m_args.pool_window.rows;
				218	const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
				219	const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
				220
				221	const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
				222	const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
				223	const auto valid_cols = this->m_args.pool_window.cols;
				224
				225	// Determine the number of valid cells and prepare the pointers
				226	const auto n_valid_cells = valid_rows * valid_cols;
				227	auto inptrs = reinterpret_cast<const TInput *>(alloca(n_valid_cells sizeof(TInput *)));
				228	{
				229	auto my_ptr = inptrs;
				230	auto row_ptr = input.base + input_iinput.ld_row + input_jinput.ld_col + channel_start;
				231	for (auto i = valid_rows; i; i--)
				232	{
				233	auto ptr = row_ptr;
				234	row_ptr += input.ld_row;
				235
				236	for (auto j = valid_cols; j; j--)
				237	{
				238	*(my_ptr++) = ptr;
				239	ptr += input.ld_col;
				240	}
				241	}
				242	}
				243
				244	auto outptr = output.base + output_ioutput.ld_row + output_joutput.ld_col + channel_start;
				245
				246	// Some padding variants include (or exclude) the padding values; we handle
				247	// this by computing the extent of the padded input tensor and hence
				248	// computing the total number of cells captured in the pooling window.
				249	const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
				250	const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
				251	const auto captured_cells = captured_rows * valid_cols;
				252	const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
				253
				254	for (; n_tile_cols; n_tile_cols--)
				255	{
				256	// Execute the kernel
				257	Invoker<TInput, TOutput, OutputStage>::invoke(
				258	reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
				259	window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
				260	);
				261
				262	// Update the pointers; the output strides by a column and the inputs
				263	// stride by a number of columns.
				264	outptr += output.ld_col;
				265	for (auto n = 0u; n < n_valid_cells; n++)
				266	{
				267	inptrs[n] += this->m_args.pool_stride.cols * input.ld_col;
				268	}
				269	}
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	270	}
				271
				272	public:
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	273	PoolingDepthfirstGeneric(
				274	const IGenericDepthfirstStrategy<TInput, TOutput, OutputStage> *strat,
				275	const PoolingArgs &args,
				276	const OutputStage &os = {}
				277	)
				278	: DepthfirstDriver<TInput, TOutput>(
				279	new GenericDepthfirstWrapper<TInput, TOutput, OutputStage>(strat, args),
				280	args
				281	),
				282	m_os(os)
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	283	{
				284	}
Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	285	};
				286
				287	} // namespace pooling
				288	} // namespace arm_conv