Blame - src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp - ml/ComputeLibrary

blob: 4aabd957cd35c977ab7c4743d65ff2f4563b562e [file] [log] [blame]

Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2021 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#pragma once
				25
				26	#include "pool_common.hpp"
				27
				28	#include <stack>
				29	#include <vector>
				30
				31	namespace arm_conv {
				32	namespace pooling {
				33
				34	template <class strategy>
				35	class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
				36	{
				37	using TInput = typename strategy::operand_type;
				38	using TOutput = typename strategy::return_type;
				39
				40	const PoolingArgs m_args; // Copy of arguments
				41
				42	constexpr static unsigned int input_rows(void)
				43	{
				44	return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
				45	}
				46
				47	constexpr static unsigned int input_cols(void)
				48	{
				49	return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
				50	}
				51
				52	size_t sizeof_input_buffer(void) const
				53	{
				54	return sizeof(TInput) * m_args.n_channels;
				55	}
				56
				57	size_t sizeof_output_buffer(void) const
				58	{
				59	return sizeof(TOutput) * m_args.n_channels;
				60	}
				61
				62	public:
				63	PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
				64	{
				65	}
				66
				67	PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
				68	PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;
				69
				70	size_t get_working_size(void) const override
				71	{
				72	// We require an array of pointers for the inputs and outputs, a
				73	// channel-length vector in which to dump surplus output, and a
				74	// channel-length vector of padding values.
				75	return sizeof_input_buffer() + sizeof_output_buffer();
				76	}
				77
				78	void execute(
				79	const void *const input,
				80	void *const output,
				81	void *const working_space
				82	) const override
				83	{
				84	const size_t ld_input_col = m_args.n_channels;
				85	const size_t ld_input_row = ld_input_col * m_args.input_cols;
				86	const size_t ld_input_batch = ld_input_row * m_args.input_rows;
				87	const size_t ld_output_col = ld_input_col;
				88	const size_t ld_output_row = ld_output_col * m_args.output_cols;
				89	const size_t ld_output_batch = ld_output_row * m_args.output_rows;
				90
				91	execute(
				92	input, ld_input_col, ld_input_row, ld_input_batch,
				93	output, ld_output_col, ld_output_row, ld_output_batch,
				94	working_space
				95	);
				96	}
				97
				98	void execute(
				99	const void *const input,
				100	size_t ld_input_col,
				101	size_t ld_input_row,
				102	size_t ld_input_batch,
				103	void *const output,
				104	size_t ld_output_col,
				105	size_t ld_output_row,
				106	size_t ld_output_batch,
				107	void *const working_space
				108	) const override
				109	{
				110	execute(
				111	m_args.n_batches, m_args.input_rows, m_args.input_cols,
				112	m_args.n_channels,
				113	input, ld_input_col, ld_input_row, ld_input_batch,
				114	m_args.padding,
				115	m_args.output_rows, m_args.output_cols,
				116	output, ld_output_col, ld_output_row, ld_output_batch,
				117	working_space
				118	);
				119	}
				120
				121	void execute(
				122	unsigned int batches,
				123	unsigned int input_height,
				124	unsigned int input_width,
				125	unsigned int channels,
				126	const void *const _input,
				127	size_t ld_input_col,
				128	size_t ld_input_row,
				129	size_t ld_input_batch,
				130	const PaddingValues &padding,
				131	unsigned int output_height,
				132	unsigned int output_width,
				133	void *const _output,
				134	size_t ld_output_col,
				135	size_t ld_output_row,
				136	size_t ld_output_batch,
				137	void *const _working_space
				138	) const override
				139	{
				140	strategy strat(m_args.cpu_info);
				141	#ifdef CYCLE_PROFILING
				142	arm_gemm::profiler prof;
				143	#endif // CYCLE_PROFILING
				144
				145	// Cast input and output pointers into the right types
				146	const TInput const inptr = static_cast<const TInput >(_input);
				147	TOutput const outptr = static_cast<TOutput >(_output);
				148
				149	// Allocate portions of the working space
				150	uint8_t const working_space = static_cast<uint8_t >(_working_space);
				151	TOutput const output_buffer = reinterpret_cast<TOutput >(working_space);
				152	TInput const input_buffer = reinterpret_cast<TInput >(working_space + sizeof_output_buffer());
				153
				154	// Fill the input buffer
				155	const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
				156	? static_cast<TInput>(0)
				157	: (std::numeric_limits<TInput>::has_infinity
				158	? -std::numeric_limits<TInput>::infinity()
				159	: std::numeric_limits<TInput>::lowest());
				160	for (unsigned int i = 0; i < channels; i++)
				161	{
				162	input_buffer[i] = pad_value;
				163	}
				164
				165	// Keep subdividing the output plane across the longest dimension until we
				166	// reach the size of the tile. Queue items for later processing. Note - we
				167	// can determine the largest size of the queue a priori from the input
				168	// tensor size, this would allow us to allocate memory within the working
				169	// space and improve performance.
				170	struct WorkItem
				171	{
				172	unsigned int output_i, output_j;
				173	unsigned int output_height, output_width;
				174
				175	WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
				176	: output_i(i), output_j(j), output_height(height), output_width(width) {}
				177	};
				178
				179	auto execute = [&] (const WorkItem &item) {
				180	// Create an array for the output pointers
				181	TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
				182	TOutput **const outptr_array = _outptr_array;
				183
				184	// Construct the output pointer array
				185	{
				186	const auto output_pad_right = strategy::out_rows() - item.output_width;
				187	auto outptr_element = outptr_array;
				188	auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;
				189
				190	// Fill the array with pointers to the output buffer
				191	for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
				192	{
				193	outptr_array[i] = output_buffer;
				194	}
				195
				196	// Fill in the valid portion of the array
				197	for (unsigned int i = 0; i < item.output_height; i++)
				198	{
				199	auto outptr_col = outptr_row;
				200	for (unsigned int j = 0; j < item.output_width; j++)
				201	{
				202	*(outptr_element++) = outptr_col;
				203	outptr_col += ld_output_col;
				204	}
				205	outptr_element += output_pad_right;
				206	outptr_row += ld_output_row;
				207	}
				208	}
				209
				210	const int start_i = item.output_i * strategy::stride_rows() - padding.top;
				211	const int end_i = start_i + input_rows();
				212	const unsigned int pad_top = std::max(0, 0 - start_i);
				213	const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));
				214
				215	const int start_j = item.output_j * strategy::stride_cols() - padding.left;
				216	const int end_j = start_j + input_cols();
				217	const unsigned int pad_left = std::max(0, 0 - start_j);
				218	const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));
				219
				220	// Create an array for the input pointers
				221	const TInput * _inptr_array[input_rows() * input_cols()];
				222	const TInput **const inptr_array = _inptr_array;
				223	{
				224	const unsigned int row_padding = pad_top + pad_bottom;
				225	const unsigned int valid_rows = input_rows() - row_padding;
				226
				227	const unsigned int col_padding = pad_left + pad_right;
				228	const unsigned int valid_cols = input_cols() - col_padding;
				229
				230	// Fill the array with pointers to the input buffer
				231	for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
				232	{
				233	inptr_array[i] = input_buffer;
				234	}
				235
				236	// Compute valid initial pointer
				237	auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;
				238
				239	// Fill in the valid portion of the input array
				240	auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
				241	for (unsigned int i = 0; i < valid_rows; i++)
				242	{
				243	auto inptr_col = inptr_row;
				244	for (unsigned int j = 0; j < valid_cols; j++)
				245	{
				246	*(inptr_element++) = inptr_col;
				247	inptr_col += ld_input_col;
				248	}
				249
				250	inptr_row += ld_input_row;
				251	inptr_element += col_padding; // Skip the padding elements
				252	}
				253	}
				254
				255	// Call the kernel
				256	#ifdef CYCLE_PROFILING
				257	// TODO Work number
				258	auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
				259	#endif // CYCLE_PROFILING
				260	strat.kernel(channels, inptr_array, outptr_array,
				261	pad_left, pad_top, pad_right, pad_bottom);
				262	};
				263
				264	// Add the initial work item to the stack of work.
				265	std::stack<WorkItem, std::vector<WorkItem>> stack;
				266	stack.push(WorkItem(0, 0, output_height, output_width));
				267	while (!stack.empty())
				268	{
				269	// Pop an item from the stack, bisect the largest dimension and either
				270	// execute the resulting tiles or add them to the stack if they are too
				271	// large.
				272	const WorkItem item(stack.top());
				273	stack.pop();
				274
				275	if (item.output_height <= strategy::out_rows() &&
				276	item.output_width <= strategy::out_cols())
				277	{
				278	execute(item);
				279	}
				280	else
				281	{
				282	// Split the largest dimension, such that we get an exact number of
				283	// tiles in the first partition.
				284	if (item.output_height >= item.output_width)
				285	{
				286	const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
				287	const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;
				288
				289	const unsigned int height_first = tiles_first * strategy::out_rows();
				290	const unsigned int height_second = item.output_height - height_first;
				291
				292	stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
				293	stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
				294	}
				295	else
				296	{
				297	const unsigned int width_in_tiles = item.output_width / strategy::out_cols();
				298	const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;
				299
				300	const unsigned int width_first = tiles_first * strategy::out_cols();
				301	const unsigned int width_second = item.output_width - width_first;
				302
				303	stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
				304	stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
				305	}
				306	}
				307	}
				308	}
				309	};
				310
				311	} // namespace pooling
				312	} // namespace arm_conv