Blame - src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h - ml/ComputeLibrary

blob: 2051f1b62f0cf73f64a6b770440add8622a2e185 [file] [log] [blame]

SiCong Li	b63b119	2022-01-28 18:24:39 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2022 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
SiCong Li	58a427f	2022-05-10 10:15:59 +0100	[diff] [blame^]	24	#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
SiCong Li	b63b119	2022-01-28 18:24:39 +0000	[diff] [blame]	25	#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
				26	#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
				27	#include "arm_compute/core/TensorInfo.h"
				28	#include "arm_compute/core/Validate.h"
				29	#include "arm_compute/core/experimental/DependencyGraph.h"
				30	#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
				31	#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
				32	#include "support/DeepCopy.h"
				33
				34	#include <vector>
				35
				36	namespace arm_compute
				37	{
				38	namespace experimental
				39	{
				40	namespace dynamic_fusion
				41	{
				42	struct ClKernelFusionGroup;
				43
				44	/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
				45	*
				46	*/
				47	struct ClKernelFusionGroup
				48	{
				49	public:
				50	using Id = DependencyGraph::Id;
				51
				52	ClKernelFusionGroup() = default;
				53	ClKernelFusionGroup(Id id)
				54	: id{ id }, graph{}, fused_kernels{}, tensors{}
				55	{
				56	}
				57	~ClKernelFusionGroup() = default;
				58
				59	void set_id(Id i)
				60	{
				61	id = i;
				62	}
				63
				64	Id add_fused_kernel(const ClKernel *kernel)
				65	{
				66	/// PRE: Acyclicity ensured by DependencyGraph
				67	/// PRE: Connectedness ensured by DependencyGraph
				68	/// PRE: Single-rootedness ensured by User
				69	std::vector<Id> src_tensors;
				70	for(const auto t : kernel->tensors().get_const_src_tensors())
				71	{
				72	auto id = graph.add_tensor(t->id);
				73	if(tensors.find(id) == tensors.end())
				74	{
				75	tensors[id] = t;
				76	}
				77	src_tensors.push_back(id);
				78	}
				79	std::vector<Id> dst_tensors;
				80	for(const auto t : kernel->tensors().get_const_dst_tensors())
				81	{
				82	auto id = graph.add_tensor(t->id);
				83	if(tensors.find(id) == tensors.end())
				84	{
				85	tensors[id] = t;
				86	}
				87	dst_tensors.push_back(id);
				88	}
				89	auto id = graph.add_operator(src_tensors, dst_tensors);
				90	fused_kernels[id.second] = kernel;
				91	return id.second;
				92	}
				93
				94	const ClKernel *get_root_kernel() const
				95	{
				96	auto root_kernels = graph.get_root_ops();
				97	ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
				98	return fused_kernels.at(root_kernels.at(0));
				99	}
				100
				101	std::vector<const ClKernelTensor *> get_src_tensors() const
				102	{
				103	std::vector<const ClKernelTensor *> src_tensors;
				104	for(auto tensor_id : graph.src_tensors())
				105	{
				106	src_tensors.push_back(tensors.at(tensor_id));
				107	}
				108	return src_tensors;
				109	}
				110
				111	std::vector<const ClKernelTensor *> get_dst_tensors() const
				112	{
				113	std::vector<const ClKernelTensor *> dst_tensors;
				114	for(auto tensor_id : graph.dst_tensors())
				115	{
				116	dst_tensors.push_back(tensors.at(tensor_id));
				117	}
				118	return dst_tensors;
				119	}
				120
				121	friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
				122	{
				123	return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
				124	}
				125
				126	Id id{};
				127	DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
				128	std::map<Id, const ClKernel *> fused_kernels{};
				129	std::map<Id, const ClKernelTensor *> tensors{};
				130	};
				131
				132	std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
				133
				134	struct ClFusedKernelGraph
				135	{
				136	public:
				137	using Id = DependencyGraph::Id;
				138
				139	using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
				140
				141	ClFusedKernelGraph() = default;
				142	~ClFusedKernelGraph() = default;
				143	ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
				144	ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
				145	ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default;
				146	ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
				147
				148	friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
				149	{
				150	/// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
				151	/// The fusion groups can already guarantee the equivalence of fusion
				152	/// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
				153	return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
				154	}
				155
				156	Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
				157	{
				158	auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
				159	for(const auto k : fused_kernels)
				160	{
				161	fg->add_fused_kernel(k);
				162	}
				163	const auto src_tensors = fg->get_src_tensors();
				164	const auto dst_tensors = fg->get_dst_tensors();
				165	std::vector<Id> inputs{};
				166	std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
				167	{
				168	return fg_dependency.add_tensor(kernel->id);
				169	});
				170	std::vector<Id> outputs{};
				171	std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
				172	{
				173	return fg_dependency.add_tensor(kernel->id);
				174	});
				175	const auto id = fg_dependency.add_operator(inputs, outputs);
				176	fg->set_id(id.second);
				177	fusion_groups[id.second] = std::move(fg);
				178	return id.second;
				179	}
				180
				181	Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
				182	{
				183	/// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
				184	ClKernelFusionGroup *fg_src{};
				185	ClKernelFusionGroup *fg_dst{};
				186	// Find fg_src (parent / root) and fg_dst (child / non-root)
				187	if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
				188	{
				189	fg_src = &fg0;
				190	fg_dst = &fg1;
				191	}
				192	else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
				193	{
				194	fg_src = &fg1;
				195	fg_dst = &fg0;
				196	}
				197	else
				198	{
				199	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
				200	}
				201
				202	for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
				203	{
				204	if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
				205	{
				206	// Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src
				207
				208	// Before:
				209	// fg_src
				210	// \|
				211	// .. t1
				212	// \| \|
				213	// -> fg_dst <-
				214	//
				215	// After:
				216	// fg_src <---t1
				217	//
				218	const auto st = link_src_tensors(fg_src->id, { t });
				219	if(!bool(st))
				220	{
				221	return st;
				222	}
				223	}
				224	else
				225	{
				226	const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
				227	if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
				228	{
				229	// Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
				230	// AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))
				231
				232	// Before:
				233	// fg_src
				234	// \|
				235	// t0
				236	// \|
				237	// -> fg_dst
				238	//
				239	// After:
				240	// fg_src
				241	//
				242	const auto st = remove_fg_tensor(t);
				243	if(!bool(st))
				244	{
				245	return st;
				246	}
				247	}
				248	else
				249	{
				250	// If the tensors ARE in between fg_src and fg_dst
				251	// BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src
				252
				253	// Before:
				254	// fg_src
				255	// \|
				256	// t0
				257	// \|
				258	// \|-----------
				259	// \| \|
				260	// -> fg_dst -> fg_other
				261	//
				262	// After:
				263	// fg_src
				264	// \|
				265	// t0
				266	// \|
				267	// -> fg_other
				268	//
				269
				270	// Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
				271	// intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
				272	// so that any unfused fg's (fg_other in this case) can read it.
				273	// So all this means that we not only can STORE the tensors at the "end" of a fusion group,
				274	// but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
				275	// can be termed "dst tensors" to a fusion group
				276	void();
				277	}
				278	}
				279	}
				280
				281	for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
				282	{
				283	// Link any outgoing tensors of fg_dst to fg_src
				284
				285	// Before:
				286	// fg_src
				287	// \|
				288	// ..
				289	// \|
				290	// -> fg_dst
				291	// \|
				292	// \|--------
				293	// \| \|
				294	// \|-> t0 \|-> t1
				295	//
				296	// After:
				297	// fg_src
				298	// \|
				299	// \|--------
				300	// \| \|
				301	// \|-> t0 \|-> t1
				302	//
				303	const auto st = link_dst_tensors(fg_src->id, { t });
				304	if(!bool(st))
				305	{
				306	return st;
				307	}
				308	}
				309
				310	// Merge fg_dst's graph into fg_src's graph
				311	for(const auto kernel : traverse(*fg_dst))
				312	{
				313	fg_src->add_fused_kernel(kernel);
				314	}
				315
				316	const auto st = remove_fg(fg_dst->id);
				317	return st;
				318	}
				319	Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
				320	{
				321	/// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
				322	/// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
				323	/// Note that this does not apply to fusion groups
				324	/// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's
				325	/// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
				326	/// INV0: All Fusion groups have a single root
				327	/// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
				328	/// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
				329	/// INV3: All non-roots are Simple kernels
				330	/// INV4: All non roots' dst tensors have the same shape as that of the root kernel
				331	/// INV5: All kernels within a fusion group have the same UnitWorkloadStage
				332	const ClKernelFusionGroup *fg_src {};
				333	const ClKernelFusionGroup *fg_dst{};
				334
				335	// Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
				336	// This guarantess INV0
				337	// This also finds fg_src (parent / root) and fg_dst (child / non-root)
				338	if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
				339	{
				340	fg_src = &fg0;
				341	fg_dst = &fg1;
				342	}
				343	else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
				344	{
				345	fg_src = &fg1;
				346	fg_dst = &fg0;
				347	}
				348	else
				349	{
				350	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
				351	}
				352
				353	// Find unconnected tensors between fg_src and fg_dst
				354	std::vector<Id> unconnected_tensors{};
				355	for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
				356	{
				357	if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
				358	{
				359	unconnected_tensors.push_back(t);
				360	}
				361	}
				362
				363	// Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
				364	// This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
				365	for(const auto &t : unconnected_tensors)
				366	{
				367	if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
				368	{
				369	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
				370	}
				371	}
				372
				373	// Check 2: All non-root fgs are simple. Ensure INV3
				374	if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
				375	{
				376	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
				377	}
				378
				379	// Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
				380	const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
				381	ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
				382	const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;
				383
				384	for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
				385	{
				386	const auto t_info = original_graph->get_tensor(t)->desc;
				387	if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
				388	{
				389	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
				390	}
				391	}
				392
				393	// Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
				394	if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
				395	{
				396	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
				397	}
				398
				399	return Status{};
				400	}
				401
				402	const ClKernelGraph *original_graph{};
				403	DependencyGraph fg_dependency{};
				404	KernelFusionGroupMap fusion_groups{};
				405	// Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups.
				406
				407	private:
				408	Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
				409	{
				410	for(auto t : src_tensors)
				411	{
				412	fg_dependency.link_input(fg, t);
				413	}
				414	return Status{};
				415	}
				416	Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
				417	{
				418	for(auto t : dst_tensors)
				419	{
				420	fg_dependency.link_output(fg, t);
				421	}
				422	return Status{};
				423	}
				424	Status remove_fg(Id fg)
				425	{
				426	fg_dependency.remove_operator(fg);
				427	fusion_groups.erase(fg);
				428	return Status{};
				429	}
				430	Status remove_fg_tensor(Id tensor)
				431	{
				432	fg_dependency.remove_tensor(tensor);
				433	return Status{};
				434	}
				435	};
				436
				437	std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
				438	std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
				439
				440	std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
				441
				442	Status fuse(ClFusedKernelGraph &fused_kernel_graph);
				443
				444	Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
				445
				446	Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
				447
				448	} // namespace dynamic_fusion
				449	} // namespace experimental
				450	} // namespace arm_compute
SiCong Li	58a427f	2022-05-10 10:15:59 +0100	[diff] [blame^]	451	#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
				452	#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */