Blame - src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp - ml/ComputeLibrary

blob: 2b2e6f355505a02e9a72ff92141458fe86b4a0f2 [file] [log] [blame]

ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	1	/*
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	2	* Copyright (c) 2022-2023 Arm Limited.
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	#include "depthfirst_driver.hpp"
				26	#include "interleaves/generic.hpp"
				27
				28	namespace arm_conv {
				29	namespace depthwise {
				30
				31	template <typename OutputStage>
				32	class IPlanarStrategy
				33	{
				34	public:
				35	virtual ~IPlanarStrategy() = default;
				36	virtual unsigned int get_output_rows(void) const = 0;
				37	virtual arm_gemm::VLType get_vl_type(void) const = 0;
				38
				39	virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
				40	virtual void pack_parameters(
				41	const DepthwiseArgs &args, void *buffer,
				42	const void *biases, const OutputStage &,
				43	const void *weights, size_t ld_weight_col, size_t ld_weight_row
				44	) const = 0;
				45	};
				46
				47
				48	template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
				49	typename OutputStage>
				50	struct PlanarKernelType;
				51
				52	template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
				53	struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
				54	{
				55	using Type = std::function<void(
				56	const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
				57	unsigned int pad_top, unsigned int valid_input_rows,
				58	unsigned int pad_left, unsigned int valid_input_cols,
				59	const TWeight , const TAccum ,
				60	TOutput *, const size_t , const size_t *, unsigned int output_cols,
				61	unsigned int start_channels, unsigned int valid_channels,
				62	TAccum act_min, TAccum act_max
				63	)>;
				64
				65	template <typename WorkspaceType>
				66	static inline void execute(
				67	const Type fn,
				68	const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
				69	unsigned int pad_top, unsigned int valid_input_rows,
				70	unsigned int pad_left, unsigned int valid_input_cols,
				71	const TWeight weights, const TAccum bias,
				72	TOutput *outptrs, const size_t outlds, const size_t *outvllds, unsigned int output_cols,
				73	unsigned int start_channel, unsigned int valid_channels,
				74	const Nothing &, const WorkspaceType *ws
				75	)
				76	{
				77	fn(
				78	inptr, ld_in_row, ld_in_col, ld_in_vl,
				79	pad_top, valid_input_rows,
				80	pad_left, valid_input_cols,
				81	weights, bias,
				82	outptrs, outlds, outvllds, output_cols,
				83	start_channel, valid_channels,
				84	ws->activation_min, ws->activation_max
				85	);
				86	}
				87	};
				88
				89	template <typename TInput, typename TWeight, typename TOutput>
				90	struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
				91	{
				92	using Type = std::function<void(
				93	const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
				94	unsigned int pad_top, unsigned int valid_input_rows,
				95	unsigned int pad_left, unsigned int valid_input_cols,
				96	const TWeight *,
				97	TOutput *, const size_t , const size_t *, unsigned int output_cols,
				98	unsigned int start_channel, unsigned int valid_channels,
				99	const arm_gemm::Requantize32 &
				100	)>;
				101
				102	template <typename WorkspaceType>
				103	static inline void execute(
				104	const Type fn,
				105	const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
				106	unsigned int pad_top, unsigned int valid_input_rows,
				107	unsigned int pad_left, unsigned int valid_input_cols,
				108	const TWeight weights, const int32_t ,
				109	TOutput *outptrs, const size_t outlds, const size_t *outldvls, unsigned int output_cols,
				110	unsigned int first_channel, unsigned int valid_channels,
				111	const arm_gemm::Requantize32 &qp, const WorkspaceType *
				112	)
				113	{
				114	fn(
				115	inptr, ld_in_row, ld_in_col, ld_in_vl,
				116	pad_top, valid_input_rows,
				117	pad_left, valid_input_cols,
				118	weights,
				119	outptrs, outlds, outldvls, output_cols,
				120	first_channel, valid_channels,
				121	qp
				122	);
				123	}
				124	};
				125
				126
				127	template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
				128	typename TAccum=typename DefaultTAccum<TOutput>::Type,
				129	typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
				130	class PlanarStrategy : public IPlanarStrategy<OutputStage>
				131	{
				132	unsigned int m_kernel_rows, m_kernel_cols;
				133	unsigned int m_stride_rows, m_stride_cols;
				134	unsigned int m_output_rows;
				135	arm_gemm::VLType m_vl_type;
				136
				137	protected:
				138	virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
				139	{
				140	// Get the kernel point to pack at the given index; return false to
				141	// indicate that this index (and all greater indices) is out of range.
				142	if (m_kernel_rows * m_kernel_cols <= index)
				143	return false;
				144
				145	y = index % m_kernel_cols;
				146	x = index / m_kernel_cols;
				147	return true;
				148	}
				149
				150	virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
				151	{
				152	return interleaves::PackingArguments(
				153	m_kernel_rows, m_kernel_cols, sizeof(TWeight),
				154	false, sizeof(TAccum), // Don't pack the bias
				155	m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO
				156	[this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
				157	{ return this->get_kernel_packing_point(idx, x, y); }
				158	);
				159	}
				160
				161	public:
				162	PlanarStrategy(
				163	unsigned int kernel_rows, unsigned int kernel_cols,
				164	unsigned int stride_rows, unsigned int stride_cols,
				165	unsigned int output_rows,
				166	arm_gemm::VLType vl_type
				167	) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
				168	m_stride_rows(stride_rows), m_stride_cols(stride_cols),
				169	m_output_rows(output_rows), m_vl_type(vl_type)
				170	{
				171	}
				172
				173	unsigned int get_output_rows(void) const override { return m_output_rows; }
				174	arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
				175
				176	size_t get_storage_size(const DepthwiseArgs &args) const override
				177	{
				178	return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
				179	}
				180
				181	void pack_parameters(
				182	const DepthwiseArgs &args, void *buffer,
				183	const void *biases, const OutputStage &,
				184	const void *weights, size_t ld_weight_col, size_t ld_weight_row
				185	) const override
				186	{
				187	interleaves::pack_parameters_generic(
				188	this->get_kernel_packing_arguments(), args,
				189	buffer, biases, weights, ld_weight_col, ld_weight_row
				190	);
				191	}
				192
				193	using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
				194	virtual KernelType get_kernel(void) const = 0;
				195	};
				196
				197
				198	namespace {
				199
				200	template <typename T>
				201	struct OutputRowPtrsElement
				202	{
				203	struct Workspace
				204	{
				205	T **output_row_ptrs;
				206	size_t *output_ld_cols;
				207	size_t *output_ld_vls; // Stride between vectors of channels
				208	T *output_padding_buffer;
				209	};
				210
				211	template <typename OutputStage>
				212	static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
				213	{
				214	// We need one pointer and stride for each row of output, and an additional
				215	// blob of memory into which padded stores can go.
				216	return args.strategy->get_output_rows() * (sizeof(T ) + 2sizeof(size_t)) +
				217	get_vector_length<char>(args.strategy->get_vl_type());
				218	}
				219
				220	template <typename WorkspaceType, typename OutputStage>
				221	static void initialise(WorkspaceType ws, void *buffer,
				222	const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
				223	{
				224	const auto n_rows = args.strategy->get_output_rows();
				225	ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
				226	ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
				227	ws->output_ld_vls = ws->output_ld_cols + n_rows;
				228	ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
				229	return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
				230	}
				231	};
				232
				233	} // namespace {anonymous}
				234
				235
				236	template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
				237	typename TAccum=typename DefaultTAccum<TOutput>::Type,
				238	typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
				239	class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
				240	{
				241	using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
				242	using StrategyType = IPlanarStrategy<OutputStage>;
				243	using WorkspaceManager = Workspace<
				244	OutputRowPtrsElement<TOutput>,
				245	ActivationsElement<TAccum, OutputStage>
				246	>;
				247	using WorkspaceType = typename WorkspaceManager::WorkspaceType;
				248
				249	std::unique_ptr<StrategyType> m_strat;
				250	const TAccum *m_bias;
				251	OutputStage m_os;
				252
				253	public:
				254	DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
				255	: Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
				256	{
				257	}
				258
Viet-Hoa Do	03b2971	2022-06-01 11:47:14 +0100	[diff] [blame]	259	DepthwisePlanar(DepthwisePlanar &) = delete;
				260	DepthwisePlanar &operator=(DepthwisePlanar &) = delete;
				261
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	262	size_t get_storage_size(void) const override
				263	{
				264	return m_strat->get_storage_size(this->m_args);
				265	}
				266
				267	void pack_parameters(
				268	void buffer, const void biases,
				269	const void *weights, size_t ld_weight_col, size_t ld_weight_row
				270	) override
				271	{
				272	m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
				273	this->m_bias = reinterpret_cast<const TAccum *>(biases);
				274	depthwise_depthfirst::stash_bias(this->m_os, biases);
				275	}
				276
				277	size_t get_working_size(unsigned int n_threads, unsigned int) const override
				278	{
				279	return this->get_working_size_per_thread() * n_threads;
				280	}
				281
				282	protected:
				283	/* Compute the amount of working space required for a single thread. */
				284	virtual size_t get_working_size_per_thread(void) const
				285	{
				286	return WorkspaceManager::get_sizeof_workspace(
				287	WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
				288	}
				289
				290	/* Initialise the working space for a thread. */
				291	virtual void initialise_working_space(void *buffer) const
				292	{
				293	WorkspaceManager::initialise(
				294	buffer,
				295	WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
				296	);
				297	}
				298
				299	/* Execute the kernel for a given chunk of work. */
				300	virtual void execute_kernel(
				301	const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
				302	unsigned int pad_top, unsigned int valid_input_rows,
				303	unsigned int pad_left, unsigned int valid_input_cols,
				304	const TWeight weights, const TAccum bias,
				305	TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
				306	unsigned int valid_output_rows, unsigned int valid_output_cols,
				307	unsigned int first_channel, unsigned int valid_channels,
				308	WorkspaceType *ws
				309	) const
				310	{
				311	// Initialise the output pointers
				312	for (auto i = 0u; i < m_strat->get_output_rows(); i++)
				313	{
				314	// Point at the output tensor for all valid rows; otherwise point at the
				315	// padding buffer.
				316	ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
				317	ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
				318	ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
				319	outptr += ld_out_row;
				320	}
				321
				322	// Execute the kernel
				323	PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
				324	reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
				325	inptr, ld_in_row, ld_in_col, ld_in_vl,
				326	pad_top, valid_input_rows, pad_left, valid_input_cols,
				327	weights, bias,
				328	ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
				329	valid_output_cols, first_channel, valid_channels,
				330	this->m_os, ws
				331	);
				332	}
				333
				334	void execute_internal(
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	335	const DepthwiseArgs &args,
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	336	const void *input,
				337	size_t ld_input_col,
				338	size_t ld_input_row,
				339	size_t ld_input_batch,
				340	const void *parameters,
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	341	void *output,
				342	size_t ld_output_col,
				343	size_t ld_output_row,
				344	size_t ld_output_batch,
				345	void *working_space,
				346	unsigned int thread_id,
				347	unsigned int n_threads
				348	) const override
				349	{
				350	// Get and initialise the working space for this thread.
				351	void *thread_working_space =
				352	static_cast<uint8_t >(working_space) + thread_id this->get_working_size_per_thread();
				353	this->initialise_working_space(thread_working_space);
				354	auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
				355
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	356	const auto n_output_channels = args.input_channels * args.channel_multiplier;
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	357	const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
				358
				359	// Get typed pointers
				360	auto input_batch = reinterpret_cast<const TInput *>(input);
				361	auto output_batch = reinterpret_cast<TOutput *>(output);
				362	auto weights = reinterpret_cast<const TWeight *>(parameters);
				363
				364	// Iterate over batches
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	365	for (auto batches = args.n_batches; batches; batches--)
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	366	{
				367	// NOTE: Other loop orderings are possible and it would be worth
				368	// investigating them.
				369
				370	// Within a batch, stripe threads across rows.
				371	for (auto start_output_i = thread_id * m_strat->get_output_rows();
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	372	start_output_i < args.output_rows;
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	373	start_output_i += n_threads * m_strat->get_output_rows())
				374	{
				375	// Determine what (if any padding) is required on the top/bottom of
				376	// this row of the convolution.
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	377	const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	378	const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
				379	const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	380	const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
				381	const unsigned int valid_output_rows = args.output_rows - start_output_i;
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	382
				383	auto inptr_row = input_batch + input_i*ld_input_row;
				384	auto outptr_row = output_batch + start_output_i * ld_output_row;
				385
				386	// Execute the kernel
				387	this->execute_kernel(
				388	inptr_row, ld_input_row, ld_input_col, vl,
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	389	input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	390	weights, this->m_bias,
				391	outptr_row, ld_output_row, ld_output_col, vl,
Pablo Marquez Tello	4e2bbbb	2023-01-09 17:21:01 +0000	[diff] [blame^]	392	valid_output_rows, args.output_cols,
ramelg01	8a16488	2022-04-07 02:42:52 +0100	[diff] [blame]	393	0 /* first channel */, n_output_channels,
				394	ws
				395	);
				396	}
				397
				398	// Update the input and output pointers to account for batch
				399	input_batch += ld_input_batch;
				400	output_batch += ld_output_batch;
				401	}
				402	}
				403	};
				404
				405	} // namespace depthwise
				406	} // namespace arm_conv