Blame - src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h - ml/ComputeLibrary

blob: 4bd3cd9d8b9c0b9a9671d78f727424fc845f7d49 [file] [log] [blame]

SiCong Li	b63b119	2022-01-28 18:24:39 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2022 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
				25	#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
				26	#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
				27	#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
				28	#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
				29	#include "arm_compute/core/TensorInfo.h"
				30	#include "arm_compute/core/Validate.h"
				31	#include "arm_compute/core/experimental/DependencyGraph.h"
				32	#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
				33	#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
				34	#include "support/DeepCopy.h"
				35
				36	#include <vector>
				37
				38	namespace arm_compute
				39	{
				40	namespace experimental
				41	{
				42	namespace dynamic_fusion
				43	{
				44	struct ClKernelFusionGroup;
				45
				46	/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
				47	*
				48	*/
				49	struct ClKernelFusionGroup
				50	{
				51	public:
				52	using Id = DependencyGraph::Id;
				53
				54	ClKernelFusionGroup() = default;
				55	ClKernelFusionGroup(Id id)
				56	: id{ id }, graph{}, fused_kernels{}, tensors{}
				57	{
				58	}
				59	~ClKernelFusionGroup() = default;
				60
				61	void set_id(Id i)
				62	{
				63	id = i;
				64	}
				65
				66	Id add_fused_kernel(const ClKernel *kernel)
				67	{
				68	/// PRE: Acyclicity ensured by DependencyGraph
				69	/// PRE: Connectedness ensured by DependencyGraph
				70	/// PRE: Single-rootedness ensured by User
				71	std::vector<Id> src_tensors;
				72	for(const auto t : kernel->tensors().get_const_src_tensors())
				73	{
				74	auto id = graph.add_tensor(t->id);
				75	if(tensors.find(id) == tensors.end())
				76	{
				77	tensors[id] = t;
				78	}
				79	src_tensors.push_back(id);
				80	}
				81	std::vector<Id> dst_tensors;
				82	for(const auto t : kernel->tensors().get_const_dst_tensors())
				83	{
				84	auto id = graph.add_tensor(t->id);
				85	if(tensors.find(id) == tensors.end())
				86	{
				87	tensors[id] = t;
				88	}
				89	dst_tensors.push_back(id);
				90	}
				91	auto id = graph.add_operator(src_tensors, dst_tensors);
				92	fused_kernels[id.second] = kernel;
				93	return id.second;
				94	}
				95
				96	const ClKernel *get_root_kernel() const
				97	{
				98	auto root_kernels = graph.get_root_ops();
				99	ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
				100	return fused_kernels.at(root_kernels.at(0));
				101	}
				102
				103	std::vector<const ClKernelTensor *> get_src_tensors() const
				104	{
				105	std::vector<const ClKernelTensor *> src_tensors;
				106	for(auto tensor_id : graph.src_tensors())
				107	{
				108	src_tensors.push_back(tensors.at(tensor_id));
				109	}
				110	return src_tensors;
				111	}
				112
				113	std::vector<const ClKernelTensor *> get_dst_tensors() const
				114	{
				115	std::vector<const ClKernelTensor *> dst_tensors;
				116	for(auto tensor_id : graph.dst_tensors())
				117	{
				118	dst_tensors.push_back(tensors.at(tensor_id));
				119	}
				120	return dst_tensors;
				121	}
				122
				123	friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
				124	{
				125	return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
				126	}
				127
				128	Id id{};
				129	DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
				130	std::map<Id, const ClKernel *> fused_kernels{};
				131	std::map<Id, const ClKernelTensor *> tensors{};
				132	};
				133
				134	std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
				135
				136	struct ClFusedKernelGraph
				137	{
				138	public:
				139	using Id = DependencyGraph::Id;
				140
				141	using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
				142
				143	ClFusedKernelGraph() = default;
				144	~ClFusedKernelGraph() = default;
				145	ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
				146	ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
				147	ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default;
				148	ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
				149
				150	friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
				151	{
				152	/// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
				153	/// The fusion groups can already guarantee the equivalence of fusion
				154	/// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
				155	return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
				156	}
				157
				158	Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
				159	{
				160	auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
				161	for(const auto k : fused_kernels)
				162	{
				163	fg->add_fused_kernel(k);
				164	}
				165	const auto src_tensors = fg->get_src_tensors();
				166	const auto dst_tensors = fg->get_dst_tensors();
				167	std::vector<Id> inputs{};
				168	std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
				169	{
				170	return fg_dependency.add_tensor(kernel->id);
				171	});
				172	std::vector<Id> outputs{};
				173	std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
				174	{
				175	return fg_dependency.add_tensor(kernel->id);
				176	});
				177	const auto id = fg_dependency.add_operator(inputs, outputs);
				178	fg->set_id(id.second);
				179	fusion_groups[id.second] = std::move(fg);
				180	return id.second;
				181	}
				182
				183	Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
				184	{
				185	/// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
				186	ClKernelFusionGroup *fg_src{};
				187	ClKernelFusionGroup *fg_dst{};
				188	// Find fg_src (parent / root) and fg_dst (child / non-root)
				189	if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
				190	{
				191	fg_src = &fg0;
				192	fg_dst = &fg1;
				193	}
				194	else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
				195	{
				196	fg_src = &fg1;
				197	fg_dst = &fg0;
				198	}
				199	else
				200	{
				201	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
				202	}
				203
				204	for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
				205	{
				206	if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
				207	{
				208	// Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src
				209
				210	// Before:
				211	// fg_src
				212	// \|
				213	// .. t1
				214	// \| \|
				215	// -> fg_dst <-
				216	//
				217	// After:
				218	// fg_src <---t1
				219	//
				220	const auto st = link_src_tensors(fg_src->id, { t });
				221	if(!bool(st))
				222	{
				223	return st;
				224	}
				225	}
				226	else
				227	{
				228	const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
				229	if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
				230	{
				231	// Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
				232	// AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))
				233
				234	// Before:
				235	// fg_src
				236	// \|
				237	// t0
				238	// \|
				239	// -> fg_dst
				240	//
				241	// After:
				242	// fg_src
				243	//
				244	const auto st = remove_fg_tensor(t);
				245	if(!bool(st))
				246	{
				247	return st;
				248	}
				249	}
				250	else
				251	{
				252	// If the tensors ARE in between fg_src and fg_dst
				253	// BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src
				254
				255	// Before:
				256	// fg_src
				257	// \|
				258	// t0
				259	// \|
				260	// \|-----------
				261	// \| \|
				262	// -> fg_dst -> fg_other
				263	//
				264	// After:
				265	// fg_src
				266	// \|
				267	// t0
				268	// \|
				269	// -> fg_other
				270	//
				271
				272	// Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
				273	// intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
				274	// so that any unfused fg's (fg_other in this case) can read it.
				275	// So all this means that we not only can STORE the tensors at the "end" of a fusion group,
				276	// but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
				277	// can be termed "dst tensors" to a fusion group
				278	void();
				279	}
				280	}
				281	}
				282
				283	for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
				284	{
				285	// Link any outgoing tensors of fg_dst to fg_src
				286
				287	// Before:
				288	// fg_src
				289	// \|
				290	// ..
				291	// \|
				292	// -> fg_dst
				293	// \|
				294	// \|--------
				295	// \| \|
				296	// \|-> t0 \|-> t1
				297	//
				298	// After:
				299	// fg_src
				300	// \|
				301	// \|--------
				302	// \| \|
				303	// \|-> t0 \|-> t1
				304	//
				305	const auto st = link_dst_tensors(fg_src->id, { t });
				306	if(!bool(st))
				307	{
				308	return st;
				309	}
				310	}
				311
				312	// Merge fg_dst's graph into fg_src's graph
				313	for(const auto kernel : traverse(*fg_dst))
				314	{
				315	fg_src->add_fused_kernel(kernel);
				316	}
				317
				318	const auto st = remove_fg(fg_dst->id);
				319	return st;
				320	}
				321	Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
				322	{
				323	/// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
				324	/// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
				325	/// Note that this does not apply to fusion groups
				326	/// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's
				327	/// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
				328	/// INV0: All Fusion groups have a single root
				329	/// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
				330	/// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
				331	/// INV3: All non-roots are Simple kernels
				332	/// INV4: All non roots' dst tensors have the same shape as that of the root kernel
				333	/// INV5: All kernels within a fusion group have the same UnitWorkloadStage
				334	const ClKernelFusionGroup *fg_src {};
				335	const ClKernelFusionGroup *fg_dst{};
				336
				337	// Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
				338	// This guarantess INV0
				339	// This also finds fg_src (parent / root) and fg_dst (child / non-root)
				340	if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
				341	{
				342	fg_src = &fg0;
				343	fg_dst = &fg1;
				344	}
				345	else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
				346	{
				347	fg_src = &fg1;
				348	fg_dst = &fg0;
				349	}
				350	else
				351	{
				352	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
				353	}
				354
				355	// Find unconnected tensors between fg_src and fg_dst
				356	std::vector<Id> unconnected_tensors{};
				357	for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
				358	{
				359	if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
				360	{
				361	unconnected_tensors.push_back(t);
				362	}
				363	}
				364
				365	// Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
				366	// This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
				367	for(const auto &t : unconnected_tensors)
				368	{
				369	if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
				370	{
				371	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
				372	}
				373	}
				374
				375	// Check 2: All non-root fgs are simple. Ensure INV3
				376	if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
				377	{
				378	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
				379	}
				380
				381	// Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
				382	const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
				383	ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
				384	const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;
				385
				386	for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
				387	{
				388	const auto t_info = original_graph->get_tensor(t)->desc;
				389	if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
				390	{
				391	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
				392	}
				393	}
				394
				395	// Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
				396	if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
				397	{
				398	return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
				399	}
				400
				401	return Status{};
				402	}
				403
				404	const ClKernelGraph *original_graph{};
				405	DependencyGraph fg_dependency{};
				406	KernelFusionGroupMap fusion_groups{};
				407	// Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups.
				408
				409	private:
				410	Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
				411	{
				412	for(auto t : src_tensors)
				413	{
				414	fg_dependency.link_input(fg, t);
				415	}
				416	return Status{};
				417	}
				418	Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
				419	{
				420	for(auto t : dst_tensors)
				421	{
				422	fg_dependency.link_output(fg, t);
				423	}
				424	return Status{};
				425	}
				426	Status remove_fg(Id fg)
				427	{
				428	fg_dependency.remove_operator(fg);
				429	fusion_groups.erase(fg);
				430	return Status{};
				431	}
				432	Status remove_fg_tensor(Id tensor)
				433	{
				434	fg_dependency.remove_tensor(tensor);
				435	return Status{};
				436	}
				437	};
				438
				439	std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
				440	std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
				441
				442	std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
				443
				444	Status fuse(ClFusedKernelGraph &fused_kernel_graph);
				445
				446	Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
				447
				448	Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
				449
				450	} // namespace dynamic_fusion
				451	} // namespace experimental
				452	} // namespace arm_compute
				453	#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H