/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "depthfirst_driver.hpp"
#include "interleaves/generic.hpp"

namespace arm_conv {
namespace depthwise {

// Interface through which the driver can query a planar strategy for its
// geometry and ask it to pack parameters, without knowing the concrete data
// types involved.
template <typename OutputStage>
class IPlanarStrategy
{
  public:
  virtual ~IPlanarStrategy() = default;
  virtual unsigned int get_output_rows(void) const = 0;
  virtual arm_gemm::VLType get_vl_type(void) const = 0;

  virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
  virtual void pack_parameters(
    const DepthwiseArgs &args, void *buffer,
    const void *biases, const OutputStage &,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) const = 0;
};


// Provides the kernel function-pointer type for a given output stage, and a
// uniform execute() wrapper which maps the driver's arguments onto that
// kernel's calling convention.
template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
          typename OutputStage>
struct PlanarKernelType;

template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
{
  typedef void (*Type)(
    const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *, const TAccum *,
    TOutput **, const size_t *, const size_t *, unsigned int output_cols,
    unsigned int start_channel, unsigned int valid_channels,
    TAccum act_min, TAccum act_max
  );

  template <typename WorkspaceType>
  static inline void execute(
    const Type fn,
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const TAccum *bias,
    TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
    unsigned int start_channel, unsigned int valid_channels,
    const Nothing &, const WorkspaceType *ws
  )
  {
    fn(
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows,
      pad_left, valid_input_cols,
      weights, bias,
      outptrs, outlds, outldvls, output_cols,
      start_channel, valid_channels,
      ws->activation_min, ws->activation_max
    );
  }
};
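
// A hypothetical sketch (for illustration only; not a kernel shipped with the
// library) of how a concrete kernel is declared against the ABI above. The
// kernel geometry (kernel size and stride) is baked into the kernel body
// rather than passed as an argument:
//
//   void hypothetical_fp32_planar_kernel(
//     const float *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
//     unsigned int pad_top, unsigned int valid_input_rows,
//     unsigned int pad_left, unsigned int valid_input_cols,
//     const float *weights, const float *bias,
//     float **outptrs, const size_t *outlds, const size_t *outldvls,
//     unsigned int output_cols,
//     unsigned int start_channel, unsigned int valid_channels,
//     float act_min, float act_max
//   );
//
// A pointer to such a function satisfies
// PlanarKernelType<float, float, float, float, Nothing>::Type.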

template <typename TInput, typename TWeight, typename TOutput>
struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  typedef void (*Type)(
    const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *,
    TOutput **, const size_t *, const size_t *, unsigned int output_cols,
    unsigned int start_channel, unsigned int valid_channels,
    const arm_gemm::Requantize32 &
  );

  template <typename WorkspaceType>
  static inline void execute(
    const Type fn,
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const int32_t *,
    TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
    unsigned int first_channel, unsigned int valid_channels,
    const arm_gemm::Requantize32 &qp, const WorkspaceType *
  )
  {
    fn(
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows,
      pad_left, valid_input_cols,
      weights,
      outptrs, outlds, outldvls, output_cols,
      first_channel, valid_channels,
      qp
    );
  }
};
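
// Note: the quantized specialisation above takes no separate bias pointer or
// activation clamp at the ABI; the bias and activation bounds are instead
// assumed to travel inside the arm_gemm::Requantize32 parameters (hence
// depthwise_depthfirst::stash_bias() in DepthwisePlanar::pack_parameters()
// below), which is why execute() ignores its bias and workspace arguments.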


template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TOutput>::Type,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class PlanarStrategy : public IPlanarStrategy<OutputStage>
{
  unsigned int m_kernel_rows, m_kernel_cols;
  unsigned int m_stride_rows, m_stride_cols;
  unsigned int m_output_rows;
  arm_gemm::VLType m_vl_type;

  protected:
  virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
  {
    // Get the kernel point to pack at the given index; return false to
    // indicate that this index (and all greater indices) is out of range.
    if (m_kernel_rows * m_kernel_cols <= index)
      return false;

    y = index % m_kernel_cols;
    x = index / m_kernel_cols;
    return true;
  }
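
  // Worked example for the mapping above (illustrative): with a 3x3 kernel,
  // index 5 yields y = 5 % 3 = 2 and x = 5 / 3 = 1, i.e. the point in kernel
  // row 1, column 2; index 9 (and above) returns false.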

  virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
  {
    return interleaves::PackingArguments(
      m_kernel_rows, m_kernel_cols, sizeof(TWeight),
      false, sizeof(TAccum), true,  // Don't pack the bias
      m_vl_type, sizeof(TAccum), 1,  // Accumulator depth of 1 TODO
      [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
      { return this->get_kernel_packing_point(idx, x, y); }
    );
  }

  public:
  PlanarStrategy(
    unsigned int kernel_rows, unsigned int kernel_cols,
    unsigned int stride_rows, unsigned int stride_cols,
    unsigned int output_rows,
    arm_gemm::VLType vl_type
  ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
      m_stride_rows(stride_rows), m_stride_cols(stride_cols),
      m_output_rows(output_rows), m_vl_type(vl_type)
  {
  }

  unsigned int get_output_rows(void) const override { return m_output_rows; }
  arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
  }

  void pack_parameters(
    const DepthwiseArgs &args, void *buffer,
    const void *biases, const OutputStage &,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) const override
  {
    interleaves::pack_parameters_generic(
      this->get_kernel_packing_arguments(), args,
      buffer, biases, weights, ld_weight_col, ld_weight_row
    );
  }

  using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
  virtual KernelType get_kernel(void) const = 0;
};


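// Illustrative only: a concrete strategy (all names hypothetical) subclasses
// PlanarStrategy, fixes its geometry in the constructor and exposes its
// specialised kernel:
//
//   class hypothetical_sve_fp32_planar_3x3_s1
//     : public PlanarStrategy<float>
//   {
//     public:
//     hypothetical_sve_fp32_planar_3x3_s1()
//     : PlanarStrategy<float>(3, 3,  // 3x3 kernel
//                             1, 1,  // unit stride
//                             4,     // assumed: four output rows per call
//                             arm_gemm::VLType::SVE)
//     {
//     }
//
//     KernelType get_kernel(void) const override
//     { return hypothetical_fp32_planar_kernel; }
//   };
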
namespace {

template <typename T>
struct OutputRowPtrsElement
{
  struct Workspace
  {
    T **output_row_ptrs;
    size_t *output_ld_cols;
    size_t *output_ld_vls;  // Stride between vectors of channels
    T *output_padding_buffer;
  };

  template <typename OutputStage>
  static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
  {
    // We need one pointer and two strides for each row of output, plus an
    // additional blob of memory into which padded stores can go.
    return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
           get_vector_length<char>(args.strategy->get_vl_type());
  }
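
  // For example (illustrative): with four output rows, 64-bit pointers and a
  // 256-bit vector length, this is 4 * (8 + 2*8) + 32 = 128 bytes.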

  template <typename WorkspaceType, typename OutputStage>
  static void *initialise(WorkspaceType *ws, void *buffer,
                          const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
  {
    const auto n_rows = args.strategy->get_output_rows();
    ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
    ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
    ws->output_ld_vls = ws->output_ld_cols + n_rows;
    ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
    return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
  }
};
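
// The memory sized by get_element_size() is carved up by initialise() in the
// following order (illustrative):
//
//   [T *output_row_ptrs[n_rows]][size_t output_ld_cols[n_rows]]
//   [size_t output_ld_vls[n_rows]][T output_padding_buffer[vector length]]
//
// and the returned pointer is one-past the padding buffer, ready for the
// next workspace element.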

} // namespace {anonymous}


template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TOutput>::Type,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
{
  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
  using StrategyType = IPlanarStrategy<OutputStage>;
  using WorkspaceManager = Workspace<
    OutputRowPtrsElement<TOutput>,
    ActivationsElement<TAccum, OutputStage>
  >;
  using WorkspaceType = typename WorkspaceManager::WorkspaceType;

  std::unique_ptr<StrategyType> m_strat;
  const TAccum *m_bias;
  OutputStage m_os;

  public:
  DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
  : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
  {
  }

  DepthwisePlanar(DepthwisePlanar &) = delete;
  DepthwisePlanar &operator=(DepthwisePlanar &) = delete;

  size_t get_storage_size(void) const override
  {
    return m_strat->get_storage_size(this->m_args);
  }

  void pack_parameters(
    void *buffer, const void *biases,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) override
  {
    m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
    this->m_bias = reinterpret_cast<const TAccum *>(biases);
    depthwise_depthfirst::stash_bias(this->m_os, biases);
  }

  size_t get_working_size(unsigned int n_threads) const override
  {
    return this->get_working_size_per_thread() * n_threads;
  }

  protected:
  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread(void) const
  {
    return WorkspaceManager::get_sizeof_workspace(
      WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
  }

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *buffer) const
  {
    WorkspaceManager::initialise(
      buffer,
      WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
    );
  }

  /* Execute the kernel for a given chunk of work. */
  virtual void execute_kernel(
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const TAccum *bias,
    TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
    unsigned int valid_output_rows, unsigned int valid_output_cols,
    unsigned int first_channel, unsigned int valid_channels,
    WorkspaceType *ws
  ) const
  {
    // Initialise the output pointers
    for (auto i = 0u; i < m_strat->get_output_rows(); i++)
    {
      // Point at the output tensor for all valid rows; otherwise point at the
      // padding buffer.
      ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
      ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
      ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
      outptr += ld_out_row;
    }

    // Execute the kernel
    PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
      reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows, pad_left, valid_input_cols,
      weights, bias,
      ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
      valid_output_cols, first_channel, valid_channels,
      this->m_os, ws
    );
  }
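
  // For example (illustrative): if the strategy produces four rows per call
  // but only valid_output_rows == 2 remain, rows 2 and 3 are pointed at the
  // shared padding buffer with zero strides, so the kernel may store to
  // every row unconditionally without corrupting the real output tensor.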

  void execute_internal(
    const DepthwiseArgs &args,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const void *parameters,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
    this->initialise_working_space(thread_working_space);
    auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);

    const auto n_output_channels = args.input_channels * args.channel_multiplier;
    const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());

    // Get typed pointers
    auto input_batch = reinterpret_cast<const TInput *>(input);
    auto output_batch = reinterpret_cast<TOutput *>(output);
    auto weights = reinterpret_cast<const TWeight *>(parameters);

    // Iterate over batches
    for (auto batches = args.n_batches; batches; batches--)
    {
      // NOTE: Other loop orderings are possible and it would be worth
      // investigating them.

      // Within a batch, stripe threads across rows: thread `thread_id`
      // starts at output row `thread_id * output_rows` and then advances by
      // `n_threads * output_rows` rows at a time.
      for (auto start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < args.output_rows;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
        const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
        const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
        const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
        const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
        const unsigned int valid_output_rows = args.output_rows - start_output_i;

        auto inptr_row = input_batch + input_i*ld_input_row;
        auto outptr_row = output_batch + start_output_i * ld_output_row;

        // Execute the kernel
        this->execute_kernel(
          inptr_row, ld_input_row, ld_input_col, vl,
          input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
          weights, this->m_bias,
          outptr_row, ld_output_row, ld_output_col, vl,
          valid_output_rows, args.output_cols,
          0 /* first channel */, n_output_channels,
          ws
        );
      }

      // Update the input and output pointers to account for the batch
      input_batch += ld_input_batch;
      output_batch += ld_output_batch;
    }
  }
};

} // namespace depthwise
} // namespace arm_conv