/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "src/core/NEON/kernels/arm_gemm/utils.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#include <limits>

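// Depth-first depthwise convolution with a channel multiplier: a generic
// tile-execution helper (common::depthwise_multiplier_execute) and the
// DepthwiseDepthfirstWithMultiplier class, which packs parameters and drives
// the strategy's kernel over output tiles.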
namespace arm_conv {
namespace depthwise {

namespace common
{
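  // Generic driver for a depthwise-with-multiplier tile kernel: partitions the
  // output rows across threads, then for each output tile builds the arrays of
  // input and output row pointers (copying padded or non-NCHW input into a
  // local buffer when the kernel cannot read the tensor directly) and calls
  // execute_tile once per input channel.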
  template <typename strategy, typename F>
  void depthwise_multiplier_execute(
    const F execute_tile,
    typename strategy::input_type pad_value,
    const DepthwiseArgs &args,
    const unsigned int batches,
    const unsigned int input_height,
    const unsigned int input_width,
    const unsigned int input_channels,
    const PaddingValues &padding,
    const void *const _input,
    const size_t ld_input_col,
    const size_t ld_input_row,
    const size_t ld_input_batch,
    const void *const parameters,
    const size_t param_stride,
    const unsigned int output_height,
    const unsigned int output_width,
    void *const _output,
    const size_t ld_output_col,
    const size_t ld_output_row,
    const size_t ld_output_batch,
    void *const _working_space,
    const unsigned int thread_id,
    const unsigned int n_threads
  )
  {
    using TInput = typename strategy::input_type;
    using TOutput = typename strategy::return_type;

    // Determine what portion of the work to do.
    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);

    // Cast input and output pointers into the right types
    const TInput *const inptr = static_cast<const TInput *>(_input);
    TOutput *const outptr = static_cast<TOutput *>(_output);

    // To simplify the kernel, we process padded or non-NCHW-ordered input into
    // a form which can be consumed by the kernel. This data is stored here and
    // passed into the kernel as an array of N pointers (one per row of the
    // input).
    TInput rearranged_input[strategy::input_rows][strategy::input_col_quads * (16 / sizeof(TInput))];
    const TInput *inptrs[strategy::input_rows];

    // Create an array for the output pointers
    TOutput *_outptr_array[strategy::output_rows * strategy::output_cols];
    TOutput **const outptr_array = _outptr_array;

    // Allocate portions of the working space
    uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
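    // The output buffer is per-thread scratch; output positions which fall
    // outside the valid output region (tile overrun at the right/bottom edges)
    // are pointed at it so the kernel can always write a full tile.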

    // For each output tile, construct the requisite set of pointers and call
    // into the kernel.
    for (unsigned int batch = 0; batch < batches; batch++)
    {
      // Get batch pointers
      const auto inptr_batch = inptr + batch * ld_input_batch;
      const auto outptr_batch = outptr + batch * ld_output_batch;

      for (int start_out_i = start_out_height;
           start_out_i < end_out_height;
           start_out_i += static_cast<int>(strategy::output_rows))
      {
        const int end_out_i = start_out_i + strategy::output_rows;
        const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
        const int end_in_i = start_in_i + strategy::input_rows;

        // Compute top/bottom padding
        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
        const unsigned int valid_output_rows = std::min(
          end_out_i - start_out_i,
          static_cast<int>(output_height) - start_out_i
        );
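        // pad_top/pad_bottom count the rows of this input tile lying above or
        // below the input tensor; valid_output_rows clamps the tile to the rows
        // of the real output which remain to be written.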

        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
        {
          const int start_in_j = start_out_j * strategy::stride_cols - args.padding.left;
          const int pad_left = -std::min(0, start_in_j);

          const int end_out_j = start_out_j + strategy::output_cols;
          const int end_in_j = start_in_j + strategy::input_cols;

          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
          const unsigned int valid_output_cols = std::min(
            end_out_j - start_out_j,
            static_cast<int>(output_width) - start_out_j
          );

          // Construct the output pointer array.
          TOutput **outptr_pos = outptr_array;
          for (auto i = 0u; i < valid_output_rows; i++)
          {
            unsigned int j = 0u;
            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
            for (; j < valid_output_cols; j++)
            {
              *(outptr_pos++) = colptr;
              colptr += ld_output_col;
            }
            for (; j < strategy::output_cols; j++)
            {
              *(outptr_pos++) = output_buffer;
            }
          }
          for (auto i = valid_output_rows; i < strategy::output_rows; i++)
          {
            for (auto j = 0u; j < strategy::output_cols; j++)
            {
              *(outptr_pos++) = output_buffer;
            }
          }

          start_out_j += strategy::output_cols;

          const uint8_t *params = static_cast<const uint8_t *>(parameters);

          // Loop over the input channels
          for (unsigned int in_c = 0; in_c < input_channels; in_c++)
          {
            // Construct the input array - first fill with padding values and
            // then fill in correct values.
            for (unsigned int i = 0; i < strategy::input_rows; i++)
            {
              for (unsigned int j = 0;
                   j < (16 / sizeof(TInput)) * strategy::input_col_quads; j++)
              {
                rearranged_input[i][j] = pad_value;
              }
              inptrs[i] = rearranged_input[i];
            }

            auto inptr_row = inptr_batch + in_c +
                             (start_in_i + pad_top) * ld_input_row +
                             (start_in_j + pad_left) * ld_input_col;
            if (ld_input_col == 1 && !pad_left &&
                start_in_j + (16 / sizeof(TInput)) * strategy::input_col_quads < input_width)
            {
              // The input tensor is already in NCHW format, and we're reading
              // an unpadded section of it - allow the kernel to read it
              // directly.
              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
              {
                inptrs[i] = inptr_row;
                inptr_row += ld_input_row;
              }
            }
            else
            {
              // Either the input tensor isn't in NCHW format, or we're reading
              // a padded section. Copy the relevant portion of the input here
              // and allow the kernel to read this.
              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
              {
                auto inptr_col = inptr_row;
                for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
                {
                  rearranged_input[i][j] = *inptr_col;
                  inptr_col += ld_input_col;
                }
                inptr_row += ld_input_row;
              }
            }

            execute_tile(inptrs, outptr_array, params);

            // Progress the output pointers
            TOutput **outptr_pos = outptr_array;
            for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
            {
              outptr_pos[i] += args.channel_multiplier;
            }

            // Progress the pointer into the parameters
            params += param_stride;
          }
        }
      }
    }
  }
}

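// Depth-first depthwise convolution in which each input channel produces
// args.channel_multiplier output channels. Biases and weights are packed per
// input channel in blocks of the vector length before execution.
//
// A minimal usage sketch, assuming a suitable strategy type and argument set
// (the names below are illustrative, not part of this header):
//
//   DepthwiseDepthfirstWithMultiplier<some_strategy> dwc(args);
//   std::vector<uint8_t> packed(dwc.get_storage_size());
//   dwc.pack_parameters(packed.data(), biases, weights, 0, 0);  // 0 = default weight strides
//   std::vector<uint8_t> scratch(dwc.get_working_size(n_threads, args.input_channels));
//   dwc.execute(..., packed.data(), ..., scratch.data(), thread_id, n_threads);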
template <class strategy>
class DepthwiseDepthfirstWithMultiplier :
  public DepthwiseCommon<typename strategy::input_type,
                         typename strategy::weight_type,
                         typename strategy::return_type>
{
  using TInput = typename strategy::input_type;
  using TWeight = typename strategy::weight_type;
  using TOutput = typename strategy::return_type;
  using TAccum = typename strategy::bias_type;

  size_t sizeof_output_buffer(unsigned int n_channels) const
  {
    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
    return sizeof(TOutput) * rounded_channels;
  }

  public:
  DepthwiseDepthfirstWithMultiplier(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
  {
  }

  DepthwiseDepthfirstWithMultiplier(DepthwiseDepthfirstWithMultiplier &) = delete;
  DepthwiseDepthfirstWithMultiplier &operator=(DepthwiseDepthfirstWithMultiplier &) = delete;

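  // Size of the packed parameter buffer: for each input channel the channel
  // multiplier is rounded up to the vector length, and each such lane carries
  // one bias value plus kernel_rows * kernel_cols weights.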
  size_t get_storage_size(void) const override
  {
    // TODO What if we insert extra padding? Biases are a different size to the inputs, ...
    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
    const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl);
    return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
  }

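  // Interleave biases and weights into the layout expected by the kernel: for
  // each input channel, and for each block of up to `vl` multiplier channels,
  // write `vl` bias values (zero when no bias is supplied) followed by `vl`
  // weights for each of the kernel_rows * kernel_cols kernel points.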
  void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
  {
    // TODO What if the kernel needs a different packing function?

    // Cast the pointers
    float *buffer = static_cast<float *>(_buffer);
    const float *biases = static_cast<const float *>(_biases);
    const float *const weights = static_cast<const float *>(_weights);

    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
    ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col;
    ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;

    for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++)
    {
      for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl)
      {
        const unsigned int out_c = in_c * this->m_args.channel_multiplier + n;
        const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n);

        // Copy across the correct amount of bias (or 0)
        for (unsigned int i = 0; i < todo; i++)
        {
          buffer[i] = (biases == nullptr) ? 0 : biases[out_c + i];
        }
        buffer += vl;

        // Copy each of the weights in turn
        auto weights_row = weights + out_c;
        for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
        {
          auto weights_col = weights_row;

          for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
          {
            for (unsigned int m = 0; m < todo; m++)
            {
              buffer[m] = weights_col[m];
            }
            buffer += vl;

            weights_col += ld_weight_col;
          }

          weights_row += ld_weight_row;
        }
      }
    }
  }

  size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
  {
    const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
    return n_threads * sizeof_output_buffer(n_output_channels);
  }

  using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
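  // Thread entry point: computes the activation clamps, partitions the output
  // rows across threads, then walks the output tiles. For each tile it builds
  // the input/output pointer arrays (copying padded or non-NCHW input into a
  // local buffer when needed) and invokes the strategy's kernel once per input
  // channel.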
  void execute(
    const unsigned int batches,
    const unsigned int input_height,
    const unsigned int input_width,
    const unsigned int input_channels,
    const PaddingValues &padding,
    const void *const _input,
    const size_t ld_input_col,
    const size_t ld_input_row,
    const size_t ld_input_batch,
    const void *const parameters,
    const unsigned int output_height,
    const unsigned int output_width,
    void *const _output,
    const size_t ld_output_col,
    const size_t ld_output_row,
    const size_t ld_output_batch,
    void *const _working_space,
    const unsigned int thread_id,
    const unsigned int n_threads
  ) const override
  {
    strategy strat(this->m_args.cpu_info);
#ifdef CYCLE_PROFILING
    arm_gemm::profiler prof;
#endif

    // Compute activation values
    TAccum activation_min = std::numeric_limits<TAccum>::has_infinity ? -std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::min();
    TAccum activation_max = std::numeric_limits<TAccum>::has_infinity ? std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::max();

    switch (this->m_args.activation.type)
    {
      case arm_gemm::Activation::Type::BoundedReLU:
        activation_max = static_cast<TAccum>(this->m_args.activation.param1);
        // Fall through
      case arm_gemm::Activation::Type::ReLU:
        activation_min = static_cast<TAccum>(0);
        break;
      default:
        break;
    }

    // Determine what portion of the work to do.
    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);

    // Need a stride over blocks of parameters
    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
    const unsigned int param_stride =
      arm_gemm::roundup(this->m_args.channel_multiplier, vl) *
      (sizeof(TAccum) + sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols);
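    // This stride mirrors the per-input-channel layout written by
    // pack_parameters: one bias plus kernel_rows * kernel_cols weights for each
    // of the (vector-length rounded) multiplier channels.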

    // Cast input and output pointers into the right types
    const TInput *const inptr = static_cast<const TInput *>(_input);
    TOutput *const outptr = static_cast<TOutput *>(_output);

    // To simplify the kernel, we process padded or non-NCHW-ordered input into
    // a form which can be consumed by the kernel. This data is stored here and
    // passed into the kernel as an array of N pointers (one per row of the
    // input).
    TInput rearranged_input[strategy::input_rows][strategy::input_col_quads * 4];
    const TInput *inptrs[strategy::input_rows];

    // Create an array for the output pointers
    TOutput *_outptr_array[strategy::output_rows * strategy::output_cols];
    TOutput **const outptr_array = _outptr_array;

    // Allocate portions of the working space
    uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);

    // For each output tile, construct the requisite set of pointers and call
    // into the kernel.
    for (unsigned int batch = 0; batch < batches; batch++)
    {
      // Get batch pointers
      const auto inptr_batch = inptr + batch * ld_input_batch;
      const auto outptr_batch = outptr + batch * ld_output_batch;

      for (int start_out_i = start_out_height;
           start_out_i < end_out_height;
           start_out_i += static_cast<int>(strategy::output_rows))
      {
        const int end_out_i = start_out_i + strategy::output_rows;
        const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
        const int end_in_i = start_in_i + strategy::input_rows;

        // Compute top/bottom padding
        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
        const unsigned int valid_output_rows = std::min(
          end_out_i - start_out_i,
          static_cast<int>(output_height) - start_out_i
        );

        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
        {
          const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
          const int pad_left = -std::min(0, start_in_j);

          const int end_out_j = start_out_j + strategy::output_cols;
          const int end_in_j = start_in_j + strategy::input_cols;

          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
          const unsigned int valid_output_cols = std::min(
            end_out_j - start_out_j,
            static_cast<int>(output_width) - start_out_j
          );

          // Construct the output pointer array.
          TOutput **outptr_pos = outptr_array;
          for (auto i = 0u; i < valid_output_rows; i++)
          {
            unsigned int j = 0u;
            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
            for (; j < valid_output_cols; j++)
            {
              *(outptr_pos++) = colptr;
              colptr += ld_output_col;
            }
            for (; j < strategy::output_cols; j++)
            {
              *(outptr_pos++) = output_buffer;
            }
          }
          for (auto i = valid_output_rows; i < strategy::output_rows; i++)
          {
            for (auto j = 0u; j < strategy::output_cols; j++)
            {
              *(outptr_pos++) = output_buffer;
            }
          }

          start_out_j += strategy::output_cols;

          const uint8_t *params = static_cast<const uint8_t *>(parameters);

          // Loop over the input channels
          for (unsigned int in_c = 0; in_c < input_channels; in_c++)
          {
            // Construct the input array - first fill with padding values and
            // then fill in correct values.
            for (unsigned int i = 0; i < strategy::input_rows; i++)
            {
              for (unsigned int j = 0; j < 4 * strategy::input_col_quads; j++)
              {
                rearranged_input[i][j] = static_cast<TInput>(0);
              }
              inptrs[i] = rearranged_input[i];
            }

            auto inptr_row = inptr_batch + in_c +
                             (start_in_i + pad_top) * ld_input_row +
                             (start_in_j + pad_left) * ld_input_col;
            if (ld_input_col == 1 && !pad_left &&
                start_in_j + 4 * strategy::input_col_quads < input_width)
            {
              // The input tensor is already in NCHW format, and we're reading
              // an unpadded section of it - allow the kernel to read it
              // directly.
              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
              {
                inptrs[i] = inptr_row;
                inptr_row += ld_input_row;
              }
            }
            else
            {
              // Either the input tensor isn't in NCHW format, or we're reading
              // a padded section. Copy the relevant portion of the input here
              // and allow the kernel to read this.
              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
              {
                auto inptr_col = inptr_row;
                for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
                {
                  rearranged_input[i][j] = *inptr_col;
                  inptr_col += ld_input_col;
                }
                inptr_row += ld_input_row;
              }
            }

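            // Run the kernel for this input channel: it reads the gathered
            // input rows, writes channel_multiplier output channels through the
            // output pointer array, and applies the activation clamps.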
            {
#ifdef CYCLE_PROFILING
              auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols));
#endif
              strat.kernel(
                inptrs, outptr_array, params,
                this->m_args.channel_multiplier,
                activation_min, activation_max
              );
            }

            // Progress the output pointers
            TOutput **outptr_pos = outptr_array;
            for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
            {
              outptr_pos[i] += this->m_args.channel_multiplier;
            }

            // Progress the pointer into the parameters
            params += param_stride;
          }
        }
      }
    }
  }
};

} // namespace depthwise
} // namespace arm_conv