blob: 2051f1b62f0cf73f64a6b770440add8622a2e185 [file] [log] [blame]
SiCong Lib63b1192022-01-28 18:24:39 +00001/*
2 * Copyright (c) 2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
SiCong Li58a427f2022-05-10 10:15:59 +010024#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
SiCong Lib63b1192022-01-28 18:24:39 +000025#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
26#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
27#include "arm_compute/core/TensorInfo.h"
28#include "arm_compute/core/Validate.h"
29#include "arm_compute/core/experimental/DependencyGraph.h"
30#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
31#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
32#include "support/DeepCopy.h"
33
34#include <vector>
35
36namespace arm_compute
37{
38namespace experimental
39{
40namespace dynamic_fusion
41{
42struct ClKernelFusionGroup;
43
44/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
45 *
46 */
47struct ClKernelFusionGroup
48{
49public:
50 using Id = DependencyGraph::Id;
51
52 ClKernelFusionGroup() = default;
53 ClKernelFusionGroup(Id id)
54 : id{ id }, graph{}, fused_kernels{}, tensors{}
55 {
56 }
57 ~ClKernelFusionGroup() = default;
58
59 void set_id(Id i)
60 {
61 id = i;
62 }
63
64 Id add_fused_kernel(const ClKernel *kernel)
65 {
66 /// PRE: Acyclicity ensured by DependencyGraph
67 /// PRE: Connectedness ensured by DependencyGraph
68 /// PRE: Single-rootedness ensured by User
69 std::vector<Id> src_tensors;
70 for(const auto t : kernel->tensors().get_const_src_tensors())
71 {
72 auto id = graph.add_tensor(t->id);
73 if(tensors.find(id) == tensors.end())
74 {
75 tensors[id] = t;
76 }
77 src_tensors.push_back(id);
78 }
79 std::vector<Id> dst_tensors;
80 for(const auto t : kernel->tensors().get_const_dst_tensors())
81 {
82 auto id = graph.add_tensor(t->id);
83 if(tensors.find(id) == tensors.end())
84 {
85 tensors[id] = t;
86 }
87 dst_tensors.push_back(id);
88 }
89 auto id = graph.add_operator(src_tensors, dst_tensors);
90 fused_kernels[id.second] = kernel;
91 return id.second;
92 }
93
94 const ClKernel *get_root_kernel() const
95 {
96 auto root_kernels = graph.get_root_ops();
97 ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
98 return fused_kernels.at(root_kernels.at(0));
99 }
100
101 std::vector<const ClKernelTensor *> get_src_tensors() const
102 {
103 std::vector<const ClKernelTensor *> src_tensors;
104 for(auto tensor_id : graph.src_tensors())
105 {
106 src_tensors.push_back(tensors.at(tensor_id));
107 }
108 return src_tensors;
109 }
110
111 std::vector<const ClKernelTensor *> get_dst_tensors() const
112 {
113 std::vector<const ClKernelTensor *> dst_tensors;
114 for(auto tensor_id : graph.dst_tensors())
115 {
116 dst_tensors.push_back(tensors.at(tensor_id));
117 }
118 return dst_tensors;
119 }
120
121 friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
122 {
123 return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
124 }
125
126 Id id{};
127 DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
128 std::map<Id, const ClKernel *> fused_kernels{};
129 std::map<Id, const ClKernelTensor *> tensors{};
130};
131
132std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
133
134struct ClFusedKernelGraph
135{
136public:
137 using Id = DependencyGraph::Id;
138
139 using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
140
141 ClFusedKernelGraph() = default;
142 ~ClFusedKernelGraph() = default;
143 ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
144 ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
145 ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default;
146 ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
147
148 friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
149 {
150 /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
151 /// The fusion groups can already guarantee the equivalence of fusion
152 /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
153 return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
154 }
155
156 Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
157 {
158 auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
159 for(const auto k : fused_kernels)
160 {
161 fg->add_fused_kernel(k);
162 }
163 const auto src_tensors = fg->get_src_tensors();
164 const auto dst_tensors = fg->get_dst_tensors();
165 std::vector<Id> inputs{};
166 std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
167 {
168 return fg_dependency.add_tensor(kernel->id);
169 });
170 std::vector<Id> outputs{};
171 std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
172 {
173 return fg_dependency.add_tensor(kernel->id);
174 });
175 const auto id = fg_dependency.add_operator(inputs, outputs);
176 fg->set_id(id.second);
177 fusion_groups[id.second] = std::move(fg);
178 return id.second;
179 }
180
181 Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
182 {
183 /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
184 ClKernelFusionGroup *fg_src{};
185 ClKernelFusionGroup *fg_dst{};
186 // Find fg_src (parent / root) and fg_dst (child / non-root)
187 if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
188 {
189 fg_src = &fg0;
190 fg_dst = &fg1;
191 }
192 else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
193 {
194 fg_src = &fg1;
195 fg_dst = &fg0;
196 }
197 else
198 {
199 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
200 }
201
202 for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
203 {
204 if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
205 {
206 // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src
207
208 // Before:
209 // fg_src
210 // |
211 // .. t1
212 // | |
213 // -> fg_dst <-
214 //
215 // After:
216 // fg_src <---t1
217 //
218 const auto st = link_src_tensors(fg_src->id, { t });
219 if(!bool(st))
220 {
221 return st;
222 }
223 }
224 else
225 {
226 const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
227 if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
228 {
229 // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
230 // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))
231
232 // Before:
233 // fg_src
234 // |
235 // t0
236 // |
237 // -> fg_dst
238 //
239 // After:
240 // fg_src
241 //
242 const auto st = remove_fg_tensor(t);
243 if(!bool(st))
244 {
245 return st;
246 }
247 }
248 else
249 {
250 // If the tensors ARE in between fg_src and fg_dst
251 // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src
252
253 // Before:
254 // fg_src
255 // |
256 // t0
257 // |
258 // |-----------
259 // | |
260 // -> fg_dst -> fg_other
261 //
262 // After:
263 // fg_src
264 // |
265 // t0
266 // |
267 // -> fg_other
268 //
269
270 // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
271 // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
272 // so that any unfused fg's (fg_other in this case) can read it.
273 // So all this means that we not only can STORE the tensors at the "end" of a fusion group,
274 // but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
275 // can be termed "dst tensors" to a fusion group
276 void();
277 }
278 }
279 }
280
281 for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
282 {
283 // Link any outgoing tensors of fg_dst to fg_src
284
285 // Before:
286 // fg_src
287 // |
288 // ..
289 // |
290 // -> fg_dst
291 // |
292 // |--------
293 // | |
294 // |-> t0 |-> t1
295 //
296 // After:
297 // fg_src
298 // |
299 // |--------
300 // | |
301 // |-> t0 |-> t1
302 //
303 const auto st = link_dst_tensors(fg_src->id, { t });
304 if(!bool(st))
305 {
306 return st;
307 }
308 }
309
310 // Merge fg_dst's graph into fg_src's graph
311 for(const auto kernel : traverse(*fg_dst))
312 {
313 fg_src->add_fused_kernel(kernel);
314 }
315
316 const auto st = remove_fg(fg_dst->id);
317 return st;
318 }
319 Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
320 {
321 /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
322 /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
323 /// Note that this does not apply to fusion groups
324 /// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's
325 /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
326 /// INV0: All Fusion groups have a single root
327 /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
328 /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
329 /// INV3: All non-roots are Simple kernels
330 /// INV4: All non roots' dst tensors have the same shape as that of the root kernel
331 /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
332 const ClKernelFusionGroup *fg_src {};
333 const ClKernelFusionGroup *fg_dst{};
334
335 // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
336 // This guarantess INV0
337 // This also finds fg_src (parent / root) and fg_dst (child / non-root)
338 if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
339 {
340 fg_src = &fg0;
341 fg_dst = &fg1;
342 }
343 else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
344 {
345 fg_src = &fg1;
346 fg_dst = &fg0;
347 }
348 else
349 {
350 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
351 }
352
353 // Find unconnected tensors between fg_src and fg_dst
354 std::vector<Id> unconnected_tensors{};
355 for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
356 {
357 if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
358 {
359 unconnected_tensors.push_back(t);
360 }
361 }
362
363 // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
364 // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
365 for(const auto &t : unconnected_tensors)
366 {
367 if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
368 {
369 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
370 }
371 }
372
373 // Check 2: All non-root fgs are simple. Ensure INV3
374 if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
375 {
376 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
377 }
378
379 // Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
380 const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
381 ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
382 const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;
383
384 for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
385 {
386 const auto t_info = original_graph->get_tensor(t)->desc;
387 if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
388 {
389 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
390 }
391 }
392
393 // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
394 if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
395 {
396 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
397 }
398
399 return Status{};
400 }
401
402 const ClKernelGraph *original_graph{};
403 DependencyGraph fg_dependency{};
404 KernelFusionGroupMap fusion_groups{};
405 // Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups.
406
407private:
408 Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
409 {
410 for(auto t : src_tensors)
411 {
412 fg_dependency.link_input(fg, t);
413 }
414 return Status{};
415 }
416 Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
417 {
418 for(auto t : dst_tensors)
419 {
420 fg_dependency.link_output(fg, t);
421 }
422 return Status{};
423 }
424 Status remove_fg(Id fg)
425 {
426 fg_dependency.remove_operator(fg);
427 fusion_groups.erase(fg);
428 return Status{};
429 }
430 Status remove_fg_tensor(Id tensor)
431 {
432 fg_dependency.remove_tensor(tensor);
433 return Status{};
434 }
435};
436
437std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
438std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
439
440std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
441
442Status fuse(ClFusedKernelGraph &fused_kernel_graph);
443
444Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
445
446Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
447
448} // namespace dynamic_fusion
449} // namespace experimental
450} // namespace arm_compute
SiCong Li58a427f2022-05-10 10:15:59 +0100451#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
452#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */