blob: 4bd3cd9d8b9c0b9a9671d78f727424fc845f7d49 [file] [log] [blame]
SiCong Lib63b1192022-01-28 18:24:39 +00001/*
2 * Copyright (c) 2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
25#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
26#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
27#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
28#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Validate.h"
31#include "arm_compute/core/experimental/DependencyGraph.h"
32#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
33#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
34#include "support/DeepCopy.h"
35
36#include <vector>
37
38namespace arm_compute
39{
40namespace experimental
41{
42namespace dynamic_fusion
43{
44struct ClKernelFusionGroup;
45
46/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
47 *
48 */
49struct ClKernelFusionGroup
50{
51public:
52 using Id = DependencyGraph::Id;
53
54 ClKernelFusionGroup() = default;
55 ClKernelFusionGroup(Id id)
56 : id{ id }, graph{}, fused_kernels{}, tensors{}
57 {
58 }
59 ~ClKernelFusionGroup() = default;
60
61 void set_id(Id i)
62 {
63 id = i;
64 }
65
66 Id add_fused_kernel(const ClKernel *kernel)
67 {
68 /// PRE: Acyclicity ensured by DependencyGraph
69 /// PRE: Connectedness ensured by DependencyGraph
70 /// PRE: Single-rootedness ensured by User
71 std::vector<Id> src_tensors;
72 for(const auto t : kernel->tensors().get_const_src_tensors())
73 {
74 auto id = graph.add_tensor(t->id);
75 if(tensors.find(id) == tensors.end())
76 {
77 tensors[id] = t;
78 }
79 src_tensors.push_back(id);
80 }
81 std::vector<Id> dst_tensors;
82 for(const auto t : kernel->tensors().get_const_dst_tensors())
83 {
84 auto id = graph.add_tensor(t->id);
85 if(tensors.find(id) == tensors.end())
86 {
87 tensors[id] = t;
88 }
89 dst_tensors.push_back(id);
90 }
91 auto id = graph.add_operator(src_tensors, dst_tensors);
92 fused_kernels[id.second] = kernel;
93 return id.second;
94 }
95
96 const ClKernel *get_root_kernel() const
97 {
98 auto root_kernels = graph.get_root_ops();
99 ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
100 return fused_kernels.at(root_kernels.at(0));
101 }
102
103 std::vector<const ClKernelTensor *> get_src_tensors() const
104 {
105 std::vector<const ClKernelTensor *> src_tensors;
106 for(auto tensor_id : graph.src_tensors())
107 {
108 src_tensors.push_back(tensors.at(tensor_id));
109 }
110 return src_tensors;
111 }
112
113 std::vector<const ClKernelTensor *> get_dst_tensors() const
114 {
115 std::vector<const ClKernelTensor *> dst_tensors;
116 for(auto tensor_id : graph.dst_tensors())
117 {
118 dst_tensors.push_back(tensors.at(tensor_id));
119 }
120 return dst_tensors;
121 }
122
123 friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
124 {
125 return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
126 }
127
128 Id id{};
129 DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
130 std::map<Id, const ClKernel *> fused_kernels{};
131 std::map<Id, const ClKernelTensor *> tensors{};
132};
133
134std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
135
136struct ClFusedKernelGraph
137{
138public:
139 using Id = DependencyGraph::Id;
140
141 using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
142
143 ClFusedKernelGraph() = default;
144 ~ClFusedKernelGraph() = default;
145 ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
146 ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
147 ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default;
148 ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
149
150 friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
151 {
152 /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
153 /// The fusion groups can already guarantee the equivalence of fusion
154 /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
155 return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
156 }
157
158 Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
159 {
160 auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
161 for(const auto k : fused_kernels)
162 {
163 fg->add_fused_kernel(k);
164 }
165 const auto src_tensors = fg->get_src_tensors();
166 const auto dst_tensors = fg->get_dst_tensors();
167 std::vector<Id> inputs{};
168 std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
169 {
170 return fg_dependency.add_tensor(kernel->id);
171 });
172 std::vector<Id> outputs{};
173 std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
174 {
175 return fg_dependency.add_tensor(kernel->id);
176 });
177 const auto id = fg_dependency.add_operator(inputs, outputs);
178 fg->set_id(id.second);
179 fusion_groups[id.second] = std::move(fg);
180 return id.second;
181 }
182
183 Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
184 {
185 /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
186 ClKernelFusionGroup *fg_src{};
187 ClKernelFusionGroup *fg_dst{};
188 // Find fg_src (parent / root) and fg_dst (child / non-root)
189 if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
190 {
191 fg_src = &fg0;
192 fg_dst = &fg1;
193 }
194 else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
195 {
196 fg_src = &fg1;
197 fg_dst = &fg0;
198 }
199 else
200 {
201 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
202 }
203
204 for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
205 {
206 if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
207 {
208 // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src
209
210 // Before:
211 // fg_src
212 // |
213 // .. t1
214 // | |
215 // -> fg_dst <-
216 //
217 // After:
218 // fg_src <---t1
219 //
220 const auto st = link_src_tensors(fg_src->id, { t });
221 if(!bool(st))
222 {
223 return st;
224 }
225 }
226 else
227 {
228 const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
229 if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
230 {
231 // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
232 // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))
233
234 // Before:
235 // fg_src
236 // |
237 // t0
238 // |
239 // -> fg_dst
240 //
241 // After:
242 // fg_src
243 //
244 const auto st = remove_fg_tensor(t);
245 if(!bool(st))
246 {
247 return st;
248 }
249 }
250 else
251 {
252 // If the tensors ARE in between fg_src and fg_dst
253 // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src
254
255 // Before:
256 // fg_src
257 // |
258 // t0
259 // |
260 // |-----------
261 // | |
262 // -> fg_dst -> fg_other
263 //
264 // After:
265 // fg_src
266 // |
267 // t0
268 // |
269 // -> fg_other
270 //
271
272 // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
273 // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
274 // so that any unfused fg's (fg_other in this case) can read it.
275 // So all this means that we not only can STORE the tensors at the "end" of a fusion group,
276 // but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
277 // can be termed "dst tensors" to a fusion group
278 void();
279 }
280 }
281 }
282
283 for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
284 {
285 // Link any outgoing tensors of fg_dst to fg_src
286
287 // Before:
288 // fg_src
289 // |
290 // ..
291 // |
292 // -> fg_dst
293 // |
294 // |--------
295 // | |
296 // |-> t0 |-> t1
297 //
298 // After:
299 // fg_src
300 // |
301 // |--------
302 // | |
303 // |-> t0 |-> t1
304 //
305 const auto st = link_dst_tensors(fg_src->id, { t });
306 if(!bool(st))
307 {
308 return st;
309 }
310 }
311
312 // Merge fg_dst's graph into fg_src's graph
313 for(const auto kernel : traverse(*fg_dst))
314 {
315 fg_src->add_fused_kernel(kernel);
316 }
317
318 const auto st = remove_fg(fg_dst->id);
319 return st;
320 }
321 Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
322 {
323 /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
324 /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
325 /// Note that this does not apply to fusion groups
326 /// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's
327 /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
328 /// INV0: All Fusion groups have a single root
329 /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
330 /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
331 /// INV3: All non-roots are Simple kernels
332 /// INV4: All non roots' dst tensors have the same shape as that of the root kernel
333 /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
334 const ClKernelFusionGroup *fg_src {};
335 const ClKernelFusionGroup *fg_dst{};
336
337 // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
338 // This guarantess INV0
339 // This also finds fg_src (parent / root) and fg_dst (child / non-root)
340 if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
341 {
342 fg_src = &fg0;
343 fg_dst = &fg1;
344 }
345 else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
346 {
347 fg_src = &fg1;
348 fg_dst = &fg0;
349 }
350 else
351 {
352 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
353 }
354
355 // Find unconnected tensors between fg_src and fg_dst
356 std::vector<Id> unconnected_tensors{};
357 for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
358 {
359 if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
360 {
361 unconnected_tensors.push_back(t);
362 }
363 }
364
365 // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
366 // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
367 for(const auto &t : unconnected_tensors)
368 {
369 if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
370 {
371 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
372 }
373 }
374
375 // Check 2: All non-root fgs are simple. Ensure INV3
376 if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
377 {
378 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
379 }
380
381 // Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
382 const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
383 ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
384 const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;
385
386 for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
387 {
388 const auto t_info = original_graph->get_tensor(t)->desc;
389 if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
390 {
391 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
392 }
393 }
394
395 // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
396 if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
397 {
398 return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
399 }
400
401 return Status{};
402 }
403
404 const ClKernelGraph *original_graph{};
405 DependencyGraph fg_dependency{};
406 KernelFusionGroupMap fusion_groups{};
407 // Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups.
408
409private:
410 Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
411 {
412 for(auto t : src_tensors)
413 {
414 fg_dependency.link_input(fg, t);
415 }
416 return Status{};
417 }
418 Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
419 {
420 for(auto t : dst_tensors)
421 {
422 fg_dependency.link_output(fg, t);
423 }
424 return Status{};
425 }
426 Status remove_fg(Id fg)
427 {
428 fg_dependency.remove_operator(fg);
429 fusion_groups.erase(fg);
430 return Status{};
431 }
432 Status remove_fg_tensor(Id tensor)
433 {
434 fg_dependency.remove_tensor(tensor);
435 return Status{};
436 }
437};
438
439std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
440std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
441
442std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
443
444Status fuse(ClFusedKernelGraph &fused_kernel_graph);
445
446Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
447
448Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
449
450} // namespace dynamic_fusion
451} // namespace experimental
452} // namespace arm_compute
453#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H