Blame - src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp - ml/ComputeLibrary

blob: ad95207fb34c4c24eb7b9861ded0f27de4431821 [file] [log] [blame]

Michele Di Giorgio	d556d7b	2020-10-27 10:56:31 +0000	[diff] [blame^]	1	/*
				2	* Copyright (c) 2021 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	#pragma once
				26
				27	#include "pool_common.hpp"
				28	#include "utils.hpp"
				29
				30	#include "arm_compute/core/Types.h"
				31	#include <limits>
				32
				33	namespace arm_conv {
				34	namespace pooling {
				35
				36	template <class strategy>
				37	class PoolingDepthfirst : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
				38	{
				39	using TInput = typename strategy::operand_type;
				40	using TOutput = typename strategy::return_type;
				41
				42	const PoolingArgs m_args; // Copy of arguments
				43
				44	constexpr static unsigned int input_rows(void)
				45	{
				46	return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
				47	}
				48
				49	constexpr static unsigned int input_cols(void)
				50	{
				51	return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
				52	}
				53
				54	size_t sizeof_input_buffer(void) const
				55	{
				56	return sizeof(TInput) * m_args.n_channels;
				57	}
				58
				59	size_t sizeof_output_buffer(void) const
				60	{
				61	return sizeof(TOutput) * m_args.n_channels;
				62	}
				63
				64	public:
				65	PoolingDepthfirst(const PoolingArgs &args) : m_args(args)
				66	{
				67	}
				68
				69	PoolingDepthfirst(PoolingDepthfirst &) = delete;
				70	PoolingDepthfirst &operator=(PoolingDepthfirst &) = delete;
				71
				72	size_t get_working_size(unsigned int num_threads) const override
				73	{
				74	// We require a channel-length vector of input padding values
				75	// (to be shared amongst all threads) and (for each thread) a
				76	// channel-length vector in which to dump surplus output.
				77	return sizeof_input_buffer() + num_threads * sizeof_output_buffer();
				78	}
				79
				80	void execute(
				81	const void *const input,
				82	void *const output,
				83	void *const working_space,
				84	unsigned int thread_id,
				85	unsigned int num_threads
				86	) const override
				87	{
				88	const size_t ld_input_col = m_args.n_channels;
				89	const size_t ld_input_row = ld_input_col * m_args.input_cols;
				90	const size_t ld_input_batch = ld_input_row * m_args.input_rows;
				91	const size_t ld_output_col = ld_input_col;
				92	const size_t ld_output_row = ld_output_col * m_args.output_cols;
				93	const size_t ld_output_batch = ld_output_row * m_args.output_rows;
				94
				95	execute(
				96	input, ld_input_col, ld_input_row, ld_input_batch,
				97	output, ld_output_col, ld_output_row, ld_output_batch,
				98	working_space,
				99	thread_id, num_threads
				100	);
				101	}
				102
				103	void execute(
				104	const void *const input,
				105	size_t ld_input_col,
				106	size_t ld_input_row,
				107	size_t ld_input_batch,
				108	void *const output,
				109	size_t ld_output_col,
				110	size_t ld_output_row,
				111	size_t ld_output_batch,
				112	void *const working_space,
				113	unsigned int thread_id,
				114	unsigned int num_threads
				115	) const override
				116	{
				117	execute(
				118	m_args.n_batches, m_args.input_rows, m_args.input_cols,
				119	m_args.n_channels,
				120	input, ld_input_col, ld_input_row, ld_input_batch,
				121	m_args.padding,
				122	m_args.output_rows, m_args.output_cols,
				123	output, ld_output_col, ld_output_row, ld_output_batch,
				124	working_space,
				125	thread_id, num_threads
				126	);
				127	}
				128
				129	void execute(
				130	unsigned int batches,
				131	unsigned int height,
				132	unsigned int width,
				133	unsigned int channels,
				134	const void *const _input,
				135	size_t ld_input_col,
				136	size_t ld_input_row,
				137	size_t ld_input_batch,
				138	const PaddingValues &padding,
				139	unsigned int output_height,
				140	unsigned int output_width,
				141	void *const _output,
				142	size_t ld_output_col,
				143	size_t ld_output_row,
				144	size_t ld_output_batch,
				145	void *const _working_space,
				146	unsigned int thread_id,
				147	unsigned int num_threads
				148	) const override
				149	{
				150	ARM_COMPUTE_UNUSED(batches, ld_input_batch, ld_output_batch);
				151	strategy strat(m_args.cpu_info);
				152	#ifdef CYCLE_PROFILING
				153	arm_gemm::profiler prof;
				154	#endif // CYCLE_PROFILING
				155
				156	// Cast input and output pointers into the right types
				157	const TInput const inptr = static_cast<const TInput >(_input);
				158	TOutput const outptr = static_cast<TOutput >(_output);
				159
				160	const unsigned int roundup_output_rows = roundup(output_height, num_threads);
				161	const unsigned int rows_per_thread = roundup_output_rows / num_threads;
				162	const int start_out_height = static_cast<int>(thread_id * rows_per_thread);
				163	const int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
				164
				165	// Create an array for the input pointers
				166	const TInput * _inptr_array[input_rows() * input_cols()];
				167	const TInput **const inptr_array = _inptr_array;
				168
				169	// Create an array for the output pointers
				170	TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
				171	TOutput **const outptr_array = _outptr_array;
				172
				173	// Allocate portions of the working space
				174	uint8_t const working_space = static_cast<uint8_t >(_working_space);
				175	TOutput const output_buffer = reinterpret_cast<TOutput >(working_space + thread_id * sizeof_output_buffer());
				176	TInput const input_buffer = reinterpret_cast<TInput >(working_space + num_threads * sizeof_output_buffer());
				177
				178	// Initialise the input buffer
				179	for (unsigned int c = 0; c < channels; c++)
				180	{
				181	TInput &val = input_buffer[c];
				182
				183	if (strategy::pooling_type() == PoolingType::AVERAGE)
				184	{
				185	val = static_cast<TInput>(0);
				186	}
				187	else if (strategy::pooling_type() == PoolingType::MAX)
				188	{
				189	#if defined(__aarch64__)
				190	using InputType = typename std::conditional<std::is_same<TInput, __fp16>::value, arm_compute::half, TInput>::type;
				191	using limits = std::numeric_limits<InputType>;
				192	#else // defined(__aarch64__)
				193	using limits = std::numeric_limits<TInput>;
				194	#endif // defined(__aarch64__)
				195	if (limits::has_infinity)
				196	{
				197	val = -limits::infinity();
				198	}
				199	else
				200	{
				201	val = limits::min();
				202	}
				203	}
				204	}
				205
				206	// For each output tile, construct the requisite set of pointers and call
				207	// into the kernel.
				208	for (unsigned int batch = 0; batch < batches; batch++)
				209	{
				210	// Get batch pointers
				211	const auto inptr_batch = inptr + batch * ld_input_batch;
				212	const auto outptr_batch = outptr + batch * ld_output_batch;
				213
				214	for (int start_out_i = start_out_height;
				215	start_out_i < end_out_height;
				216	start_out_i += static_cast<int>(strategy::out_rows()))
				217	{
				218	const int end_out_i = start_out_i + strategy::out_rows();
				219	const int start_in_i = start_out_i * strategy::stride_rows() - padding.top;
				220	const int end_in_i = start_in_i + input_rows();
				221
				222	// Compute top/bottom padding - TODO Is this right for average pooling?
				223	const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
				224	const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
				225	const unsigned int valid_output_rows = std::min(
				226	end_out_i - start_out_i,
				227	static_cast<int>(end_out_height) - start_out_i
				228	);
				229
				230	// Fill the input pointer array with padding values
				231	for (auto index = 0u; index < input_rows() * input_cols(); index++)
				232	{
				233	inptr_array[index] = input_buffer;
				234	}
				235
				236	for (int start_out_j = 0, start_in_j = -padding.left;
				237	start_out_j < static_cast<int>(output_width);
				238	start_out_j += static_cast<int>(strategy::out_cols()),
				239	start_in_j += static_cast<int>(strategy::out_cols()) * strategy::stride_cols())
				240	{
				241	const int end_out_j = start_out_j + strategy::out_cols();
				242	const int end_in_j = start_in_j + input_cols();
				243
				244	// Compute left/right padding - TODO Is this right for average pooling?
				245	const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
				246	const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
				247
				248	const unsigned int valid_output_cols = std::min(
				249	end_out_j - start_out_j,
				250	static_cast<int>(output_width) - start_out_j
				251	);
				252
				253	// Construct the input pointer array - fill the array with pointers to
				254	// the input buffer and then fill in the required values.
				255	for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
				256	{
				257	// Can skip over the left padding because we will have either the
				258	// same or less than the previous tile.
				259	unsigned int j = pad_left;
				260	const TInput colptr = inptr_batch + (start_in_i + i) ld_input_row + (start_in_j + j) * ld_input_col;
				261	const TInput *ptrs = inptr_array + i input_cols() + j;
				262	for (; j < input_cols() - pad_right; j++)
				263	{
				264	*(ptrs++) = colptr;
				265	colptr += ld_input_col;
				266	}
				267	for (; j < input_cols(); j++)
				268	{
				269	*(ptrs++) = input_buffer;
				270	}
				271	}
				272
				273	// Construct the output pointer array.
				274	TOutput **outptr_pos = outptr_array;
				275	for (auto i = 0u; i < valid_output_rows; i++)
				276	{
				277	unsigned int j = 0u;
				278	TOutput colptr = outptr_batch + (start_out_i + i) ld_output_row + start_out_j * ld_output_col;
				279	for (; j < valid_output_cols; j++)
				280	{
				281	*(outptr_pos++) = colptr;
				282	colptr += ld_output_col;
				283	}
				284	for (; j < strategy::out_cols(); j++)
				285	{
				286	*(outptr_pos++) = output_buffer;
				287	}
				288	}
				289	for (auto i = valid_output_rows; i < strategy::out_rows(); i++)
				290	{
				291	for (auto j = 0u; j < strategy::out_cols(); j++)
				292	{
				293	*(outptr_pos++) = output_buffer;
				294	}
				295	}
				296
				297	#ifdef CYCLE_PROFILING
				298	// TODO Work number
				299	auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::out_rows() * strategy::out_cols() * strategy::pool_rows() * strategy::pool_cols()));
				300	#endif
				301	strat.kernel(
				302	channels, inptr_array, outptr_array,
				303	m_args.exclude_padding, pad_left, pad_top, pad_right, pad_bottom
				304	);
				305	}
				306	}
				307	}
				308	}
				309	};
				310
				311	} // namespace pooling
				312	} // namespace arm_conv