/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "src/core/NEON/kernels/arm_gemm/utils.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

namespace arm_conv {
namespace depthwise {

namespace common
{
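  // Shared implementation of a depthwise convolution with a channel multiplier.
  // For each batch and each output tile, this routine builds an array of input
  // row pointers (copying into a padded scratch buffer where necessary), builds
  // an array of output pointers (redirecting out-of-range positions to a
  // scratch buffer), and then calls `execute_tile` once per input channel.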
  template <typename strategy, typename F>
  void depthwise_multiplier_execute(
    const F execute_tile,
    typename strategy::input_type pad_value,
    const DepthwiseArgs &args,
    const unsigned int batches,
    const unsigned int input_height,
    const unsigned int input_width,
    const unsigned int input_channels,
    const PaddingValues &padding,
    const void *const _input,
    const size_t ld_input_col,
    const size_t ld_input_row,
    const size_t ld_input_batch,
    const void *const parameters,
    const size_t param_stride,
    const unsigned int output_height,
    const unsigned int output_width,
    void *const _output,
    const size_t ld_output_col,
    const size_t ld_output_row,
    const size_t ld_output_batch,
    void *const _working_space,
    const unsigned int thread_id,
    const unsigned int n_threads
  )
  {
    using TInput = typename strategy::input_type;
    using TOutput = typename strategy::return_type;

    // Determine what portion of the work to do.
    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);

    // Cast input and output pointers into the right types
    const TInput *const inptr = static_cast<const TInput *>(_input);
    TOutput *const outptr = static_cast<TOutput *>(_output);

    // To simplify the kernel, we process padded or non-NCHW-ordered input into
    // a form which can be consumed by the kernel. This data is stored here and
    // passed into the kernel as an array of N pointers (one per row of the
    // input).
    TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*(16 / sizeof(TInput))];
    const TInput *inptrs[strategy::input_rows];

    // Create an array for the output pointers
    TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
    TOutput **const outptr_array = _outptr_array;

    // Allocate portions of the working space
    uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
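    // `output_buffer` is a scratch area in the working space: output pointers
    // for tile positions which fall outside the valid output region are pointed
    // here, so the kernel can always write a full tile without bounds checks.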

    // For each output tile, construct the requisite set of pointers and call
    // into the kernel.
    for (unsigned int batch = 0; batch < batches; batch++)
    {
      // Get batch pointers
      const auto inptr_batch = inptr + batch * ld_input_batch;
      const auto outptr_batch = outptr + batch * ld_output_batch;

      for (int start_out_i = start_out_height;
           start_out_i < end_out_height;
           start_out_i += static_cast<int>(strategy::output_rows))
      {
        const int end_out_i = start_out_i + strategy::output_rows;
        const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
        const int end_in_i = start_in_i + strategy::input_rows;

        // Compute top/bottom padding
        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
        const unsigned int valid_output_rows = std::min(
          end_out_i - start_out_i,
          static_cast<int>(output_height) - start_out_i
        );

        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
        {
          const int start_in_j = start_out_j * strategy::stride_cols - args.padding.left;
          const int pad_left = -std::min(0, start_in_j);

          const int end_out_j = start_out_j + strategy::output_cols;
          const int end_in_j = start_in_j + strategy::input_cols;

          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
          const unsigned int valid_output_cols = std::min(
            end_out_j - start_out_j,
            static_cast<int>(output_width) - start_out_j
          );

          // Construct the output pointer array.
          TOutput **outptr_pos = outptr_array;
          for (auto i = 0u; i < valid_output_rows; i++)
          {
            unsigned int j = 0u;
            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
            for (; j < valid_output_cols; j++)
            {
              *(outptr_pos++) = colptr;
              colptr += ld_output_col;
            }
            for (; j < strategy::output_cols; j++)
            {
              *(outptr_pos++) = output_buffer;
            }
          }
          for (auto i = valid_output_rows; i < strategy::output_rows; i++)
          {
            for (auto j = 0u; j < strategy::output_cols; j++)
            {
              *(outptr_pos++) = output_buffer;
            }
          }

          start_out_j += strategy::output_cols;

          const uint8_t *params = static_cast<const uint8_t *>(parameters);

          // Loop over the input channels
          for (unsigned int in_c = 0; in_c < input_channels; in_c++)
          {
            // Construct the input array - first fill with padding values and
            // then fill in correct values.
            for (unsigned int i = 0; i < strategy::input_rows; i++)
            {
              for (unsigned int j = 0;
                   j < (16 / sizeof(TInput)) * strategy::input_col_quads; j++)
              {
                rearranged_input[i][j] = pad_value;
              }
              inptrs[i] = rearranged_input[i];
            }

            auto inptr_row = inptr_batch + in_c +
              (start_in_i + pad_top) * ld_input_row +
              (start_in_j + pad_left) * ld_input_col;
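            // Fast path: with a unit column stride (dense NCHW rows), no left
            // padding and a tile which lies entirely within the input row, the
            // kernel can read the input tensor directly; otherwise copy the
            // valid region into the padded scratch buffer above.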
            if (ld_input_col == 1 && !pad_left &&
                start_in_j + (16 / sizeof(TInput)) * strategy::input_col_quads < input_width)
            {
              // The input tensor is already in NCHW format, and we're reading
              // an unpadded section of it - allow the kernel to read it
              // directly.
              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
              {
                inptrs[i] = inptr_row;
                inptr_row += ld_input_row;
              }
            }
            else
            {
              // Either the input tensor isn't in NCHW format, or we're reading
              // a padded section. Copy the relevant portion of the input here
              // and allow the kernel to read this.
              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
              {
                auto inptr_col = inptr_row;
                for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
                {
                  rearranged_input[i][j] = *inptr_col;
                  inptr_col += ld_input_col;
                }
                inptr_row += ld_input_row;
              }
            }

            execute_tile(inptrs, outptr_array, params);

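            // Each input channel produces `channel_multiplier` consecutive
            // output channels, so step the output pointers along by that much
            // before processing the next input channel.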
            // Progress the output pointers
            TOutput **outptr_pos = outptr_array;
            for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
            {
              outptr_pos[i] += args.channel_multiplier;
            }

            // Progress the pointer into the parameters
            params += param_stride;
          }
        }
      }
    }
  }
}

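// Depth-first implementation of a depthwise convolution in which every input
// channel produces `channel_multiplier` output channels. Biases and weights
// are repacked by pack_parameters() into the per-channel layout consumed by
// the strategy's kernel.
//
// A `strategy` type is expected to expose roughly the following interface
// (inferred from how it is used below; this sketch is illustrative, not
// normative, and the example values are placeholders):
//
//   struct ExampleStrategy
//   {
//     using input_type  = float;  // TInput
//     using weight_type = float;  // TWeight
//     using return_type = float;  // TOutput
//     using bias_type   = float;  // TAccum
//
//     static constexpr arm_gemm::VLType vl_type = arm_gemm::VLType::None;
//
//     static constexpr unsigned int kernel_rows = 3, kernel_cols = 3;
//     static constexpr unsigned int stride_rows = 1, stride_cols = 1;
//     static constexpr unsigned int output_rows = 2, output_cols = 2;
//     static constexpr unsigned int input_rows  = 4, input_cols  = 4;
//     static constexpr unsigned int input_col_quads = 1;  // 16-byte quads per row buffer
//
//     ExampleStrategy(const CPUInfo *);  // constructed from m_args.cpu_info
//     void kernel(const input_type *const *inptrs, return_type *const *outptrs,
//                 const void *params, unsigned int channel_multiplier,
//                 bias_type act_min, bias_type act_max) const;
//   };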
template <class strategy>
class DepthwiseDepthfirstWithMultiplier :
  public DepthwiseCommon<typename strategy::input_type,
                         typename strategy::weight_type,
                         typename strategy::return_type>
{
  using TInput = typename strategy::input_type;
  using TWeight = typename strategy::weight_type;
  using TOutput = typename strategy::return_type;
  using TAccum = typename strategy::bias_type;

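  // Size (in bytes) of the scratch output buffer: one vector-length-rounded
  // group of output channels. Out-of-range output pointers are redirected to a
  // buffer of this size so the kernel never writes outside the output tensor.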
  size_t sizeof_output_buffer(unsigned int n_channels) const
  {
    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
    return sizeof(TOutput) * rounded_channels;
  }

  public:
  DepthwiseDepthfirstWithMultiplier(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
  {
  }

  DepthwiseDepthfirstWithMultiplier(DepthwiseDepthfirstWithMultiplier &) = delete;
  DepthwiseDepthfirstWithMultiplier &operator=(DepthwiseDepthfirstWithMultiplier &) = delete;

  size_t get_storage_size(void) const override
  {
    // TODO What if we insert extra padding? Biases are a different size to the inputs, ...
    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
    const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl);
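    // Each (vector-length padded) output channel stores one bias value plus one
    // weight per kernel point, hence the (1 + kernel_rows * kernel_cols) factor.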
    return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
  }

  void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
  {
    // TODO What if the kernel needs a different packing function?

    // Cast the pointers
    float *buffer = static_cast<float *>(_buffer);
    const float *biases = static_cast<const float *>(_biases);
    const float *const weights = static_cast<const float *>(_weights);

    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
    ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col;
    ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;

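    // Packed layout: for each input channel, and for each group of up to `vl`
    // channel-multiplier outputs, write `vl` bias values (zero where no bias is
    // supplied) followed by kernel_rows * kernel_cols blocks of `vl` weights.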
    for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++)
    {
      for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl)
      {
        const unsigned int out_c = in_c * this->m_args.channel_multiplier + n;
        const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n);

        // Copy across the correct amount of bias (or 0)
        for (unsigned int i = 0; i < todo; i++)
        {
          buffer[i] = (biases == nullptr) ? 0 : biases[out_c + i];
        }
        buffer += vl;

        // Copy each of the weights in turn
        auto weights_row = weights + out_c;
        for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
        {
          auto weights_col = weights_row;

          for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
          {
            for (unsigned int m = 0; m < todo; m++)
            {
              buffer[m] = weights_col[m];
            }
            buffer += vl;

            weights_col += ld_weight_col;
          }

          weights_row += ld_weight_row;
        }
      }
    }
  }

  size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
  {
    const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
    return n_threads * sizeof_output_buffer(n_output_channels);
  }

  using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
  void execute(
    const unsigned int batches,
    const unsigned int input_height,
    const unsigned int input_width,
    const unsigned int input_channels,
    const PaddingValues &padding,
    const void *const _input,
    const size_t ld_input_col,
    const size_t ld_input_row,
    const size_t ld_input_batch,
    const void *const parameters,
    const unsigned int output_height,
    const unsigned int output_width,
    void *const _output,
    const size_t ld_output_col,
    const size_t ld_output_row,
    const size_t ld_output_batch,
    void *const _working_space,
    const unsigned int thread_id,
    const unsigned int n_threads
  ) const override
  {
    strategy strat(this->m_args.cpu_info);
#ifdef CYCLE_PROFILING
    arm_gemm::profiler prof;
#endif

    // Compute activation values
    TAccum activation_min = std::numeric_limits<TAccum>::has_infinity ? -std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::min();
    TAccum activation_max = std::numeric_limits<TAccum>::has_infinity ? std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::max();
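    // Default to the widest representable range (infinities for floating-point
    // accumulators) so that "no activation" leaves the outputs unclamped, then
    // tighten the bounds according to the requested activation.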

    switch (this->m_args.activation.type)
    {
      case arm_gemm::Activation::Type::BoundedReLU:
        activation_max = static_cast<TAccum>(this->m_args.activation.param1);
        // Fall through
      case arm_gemm::Activation::Type::ReLU:
        activation_min = static_cast<TAccum>(0);
        break;
      default:
        break;
    }

    // Determine what portion of the work to do.
    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);

    // Need a stride over blocks of parameters
    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
    const unsigned int param_stride =
      arm_gemm::roundup(this->m_args.channel_multiplier, vl) *
      (sizeof(TAccum) + sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols);
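    // `param_stride` is the size in bytes of one input channel's packed
    // parameters (one padded vector of biases plus one padded vector of weights
    // per kernel point), matching the layout produced by pack_parameters().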

    // Cast input and output pointers into the right types
    const TInput *const inptr = static_cast<const TInput *>(_input);
    TOutput *const outptr = static_cast<TOutput *>(_output);

    // To simplify the kernel, we process padded or non-NCHW-ordered input into
    // a form which can be consumed by the kernel. This data is stored here and
    // passed into the kernel as an array of N pointers (one per row of the
    // input).
    TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*4];
    const TInput *inptrs[strategy::input_rows];

    // Create an array for the output pointers
    TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
    TOutput **const outptr_array = _outptr_array;

    // Allocate portions of the working space
    uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
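    // Each thread uses its own scratch output buffer: get_working_size(thread_id,
    // input_channels) is exactly thread_id buffers' worth of bytes, so this
    // offsets into the shared working space to this thread's region.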

    // For each output tile, construct the requisite set of pointers and call
    // into the kernel.
    for (unsigned int batch = 0; batch < batches; batch++)
    {
      // Get batch pointers
      const auto inptr_batch = inptr + batch * ld_input_batch;
      const auto outptr_batch = outptr + batch * ld_output_batch;

      for (int start_out_i = start_out_height;
           start_out_i < end_out_height;
           start_out_i += static_cast<int>(strategy::output_rows))
      {
        const int end_out_i = start_out_i + strategy::output_rows;
        const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
        const int end_in_i = start_in_i + strategy::input_rows;

        // Compute top/bottom padding
        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
        const unsigned int valid_output_rows = std::min(
          end_out_i - start_out_i,
          static_cast<int>(output_height) - start_out_i
        );

        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
        {
          const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
          const int pad_left = -std::min(0, start_in_j);

          const int end_out_j = start_out_j + strategy::output_cols;
          const int end_in_j = start_in_j + strategy::input_cols;

          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
          const unsigned int valid_output_cols = std::min(
            end_out_j - start_out_j,
            static_cast<int>(output_width) - start_out_j
          );

          // Construct the output pointer array.
          TOutput **outptr_pos = outptr_array;
          for (auto i = 0u; i < valid_output_rows; i++)
          {
            unsigned int j = 0u;
            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
            for (; j < valid_output_cols; j++)
            {
              *(outptr_pos++) = colptr;
              colptr += ld_output_col;
            }
            for (; j < strategy::output_cols; j++)
            {
              *(outptr_pos++) = output_buffer;
            }
          }
          for (auto i = valid_output_rows; i < strategy::output_rows; i++)
          {
            for (auto j = 0u; j < strategy::output_cols; j++)
            {
              *(outptr_pos++) = output_buffer;
            }
          }

          start_out_j += strategy::output_cols;

          const uint8_t *params = static_cast<const uint8_t *>(parameters);

          // Loop over the input channels
          for (unsigned int in_c = 0; in_c < input_channels; in_c++)
          {
            // Construct the input array - first fill with padding values and
            // then fill in correct values.
            for (unsigned int i = 0; i < strategy::input_rows; i++)
            {
              for (unsigned int j = 0; j < 4 * strategy::input_col_quads; j++)
              {
                rearranged_input[i][j] = static_cast<TInput>(0);
              }
              inptrs[i] = rearranged_input[i];
            }

            auto inptr_row = inptr_batch + in_c +
              (start_in_i + pad_top) * ld_input_row +
              (start_in_j + pad_left) * ld_input_col;
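            // As in common::depthwise_multiplier_execute: read the tensor
            // directly when the row is dense, unpadded and fully in bounds,
            // otherwise copy the valid region into the scratch buffer.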
            if (ld_input_col == 1 && !pad_left &&
                start_in_j + 4 * strategy::input_col_quads < input_width)
            {
              // The input tensor is already in NCHW format, and we're reading
              // an unpadded section of it - allow the kernel to read it
              // directly.
              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
              {
                inptrs[i] = inptr_row;
                inptr_row += ld_input_row;
              }
            }
            else
            {
              // Either the input tensor isn't in NCHW format, or we're reading
              // a padded section. Copy the relevant portion of the input here
              // and allow the kernel to read this.
              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
              {
                auto inptr_col = inptr_row;
                for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
                {
                  rearranged_input[i][j] = *inptr_col;
                  inptr_col += ld_input_col;
                }
                inptr_row += ld_input_row;
              }
            }

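            // Run the kernel for this (tile, input channel) pair; it writes
            // channel_multiplier output channels per tile position, with the
            // activation range [activation_min, activation_max] applied.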
            {
#ifdef CYCLE_PROFILING
              auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols));
#endif
              strat.kernel(
                inptrs, outptr_array, params,
                this->m_args.channel_multiplier,
                activation_min, activation_max
              );
            }

            // Progress the output pointers
            TOutput **outptr_pos = outptr_array;
            for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
            {
              outptr_pos[i] += this->m_args.channel_multiplier;
            }

            // Progress the pointer into the parameters
            params += param_stride;
          }
        }
      }
    }
  }
};

}  // namespace depthwise
}  // namespace arm_conv