SiCong Li | b63b119 | 2022-01-28 18:24:39 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2022 Arm Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
SiCong Li | 4e9f568 | 2022-05-10 10:15:59 +0100 | [diff] [blame^] | 24 | #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION |
SiCong Li | b63b119 | 2022-01-28 18:24:39 +0000 | [diff] [blame] | 25 | #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H |
| 26 | #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H |
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/experimental/DependencyGraph.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
#include "support/DeepCopy.h"

#include <algorithm>
#include <map>
#include <utility>
#include <vector>
| 35 | |
| 36 | namespace arm_compute |
| 37 | { |
| 38 | namespace experimental |
| 39 | { |
| 40 | namespace dynamic_fusion |
| 41 | { |
| 42 | struct ClKernelFusionGroup; |
| 43 | |
| 44 | /** A const view of a subgraph of the @ref ClKernelGraph to be fused together |
| 45 | * |
| 46 | */ |
| 47 | struct ClKernelFusionGroup |
| 48 | { |
| 49 | public: |
| 50 | using Id = DependencyGraph::Id; |
| 51 | |
| 52 | ClKernelFusionGroup() = default; |
| 53 | ClKernelFusionGroup(Id id) |
| 54 | : id{ id }, graph{}, fused_kernels{}, tensors{} |
| 55 | { |
| 56 | } |
| 57 | ~ClKernelFusionGroup() = default; |
| 58 | |
| 59 | void set_id(Id i) |
| 60 | { |
| 61 | id = i; |
| 62 | } |
| 63 | |
| 64 | Id add_fused_kernel(const ClKernel *kernel) |
| 65 | { |
| 66 | /// PRE: Acyclicity ensured by DependencyGraph |
| 67 | /// PRE: Connectedness ensured by DependencyGraph |
| 68 | /// PRE: Single-rootedness ensured by User |
| 69 | std::vector<Id> src_tensors; |
| 70 | for(const auto t : kernel->tensors().get_const_src_tensors()) |
| 71 | { |
| 72 | auto id = graph.add_tensor(t->id); |
| 73 | if(tensors.find(id) == tensors.end()) |
| 74 | { |
| 75 | tensors[id] = t; |
| 76 | } |
| 77 | src_tensors.push_back(id); |
| 78 | } |
| 79 | std::vector<Id> dst_tensors; |
| 80 | for(const auto t : kernel->tensors().get_const_dst_tensors()) |
| 81 | { |
| 82 | auto id = graph.add_tensor(t->id); |
| 83 | if(tensors.find(id) == tensors.end()) |
| 84 | { |
| 85 | tensors[id] = t; |
| 86 | } |
| 87 | dst_tensors.push_back(id); |
| 88 | } |
| 89 | auto id = graph.add_operator(src_tensors, dst_tensors); |
| 90 | fused_kernels[id.second] = kernel; |
| 91 | return id.second; |
| 92 | } |
| 93 | |
| 94 | const ClKernel *get_root_kernel() const |
| 95 | { |
| 96 | auto root_kernels = graph.get_root_ops(); |
| 97 | ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1); |
| 98 | return fused_kernels.at(root_kernels.at(0)); |
| 99 | } |
| 100 | |
| 101 | std::vector<const ClKernelTensor *> get_src_tensors() const |
| 102 | { |
| 103 | std::vector<const ClKernelTensor *> src_tensors; |
| 104 | for(auto tensor_id : graph.src_tensors()) |
| 105 | { |
| 106 | src_tensors.push_back(tensors.at(tensor_id)); |
| 107 | } |
| 108 | return src_tensors; |
| 109 | } |
| 110 | |
| 111 | std::vector<const ClKernelTensor *> get_dst_tensors() const |
| 112 | { |
| 113 | std::vector<const ClKernelTensor *> dst_tensors; |
| 114 | for(auto tensor_id : graph.dst_tensors()) |
| 115 | { |
| 116 | dst_tensors.push_back(tensors.at(tensor_id)); |
| 117 | } |
| 118 | return dst_tensors; |
| 119 | } |
| 120 | |
| 121 | friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) |
| 122 | { |
| 123 | return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors; |
| 124 | } |
| 125 | |
| 126 | Id id{}; |
| 127 | DependencyGraph graph{}; // A subgraph of the original ClKernelGraph |
| 128 | std::map<Id, const ClKernel *> fused_kernels{}; |
| 129 | std::map<Id, const ClKernelTensor *> tensors{}; |
| 130 | }; |
| 131 | |
/** Get the fused kernels of @p group in traversal order (as defined by the group's internal @ref DependencyGraph) */
std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
| 133 | |
/** A DAG of fusion groups (@ref ClKernelFusionGroup), modelling the fused form of an original @ref ClKernelGraph */
struct ClFusedKernelGraph
{
public:
    using Id = DependencyGraph::Id;

    // Owning map: Id of a fusion group (within fg_dependency) -> the fusion group itself
    using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;

    ClFusedKernelGraph() = default;
    ~ClFusedKernelGraph() = default;
    ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
    ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
    ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default;
    ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;

    friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
    {
        /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
        /// The fusion groups can already guarantee the equivalence of fusion
        /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
        return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
    }

    /** Create a new fusion group out of @p fused_kernels and add it to the fused graph
     *
     * @param[in] fused_kernels Kernels of the original @ref ClKernelGraph to group together (non-owning)
     *
     * @return Id of the new fusion group within @ref fg_dependency
     */
    Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
    {
        auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
        for(const auto k : fused_kernels)
        {
            fg->add_fused_kernel(k);
        }
        const auto src_tensors = fg->get_src_tensors();
        const auto dst_tensors = fg->get_dst_tensors();
        // Mirror the group's boundary tensors in fg_dependency.
        // NOTE: the lambda parameter is a ClKernelTensor pointer, despite being named "kernel"
        std::vector<Id> inputs{};
        std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
        {
            return fg_dependency.add_tensor(kernel->id);
        });
        std::vector<Id> outputs{};
        std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
        {
            return fg_dependency.add_tensor(kernel->id);
        });
        const auto id = fg_dependency.add_operator(inputs, outputs);
        fg->set_id(id.second);
        fusion_groups[id.second] = std::move(fg);
        return id.second;
    }

    /** Fuse two directly-connected fusion groups into one: the child group's kernels are merged into the
     *  parent group, and the child group is removed from the fused graph
     *
     * @param[in,out] fg0 One of the two groups; either @p fg0 or @p fg1 must be a direct parent of the other
     * @param[in,out] fg1 The other group
     *
     * @return Status Error if the two groups are not directly connected
     */
    Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
    {
        /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
        ClKernelFusionGroup *fg_src{};
        ClKernelFusionGroup *fg_dst{};
        // Find fg_src (parent / root) and fg_dst (child / non-root)
        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
        {
            fg_src = &fg0;
            fg_dst = &fg1;
        }
        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
        {
            fg_src = &fg1;
            fg_dst = &fg0;
        }
        else
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
        }

        // Rewire / prune every incoming tensor of fg_dst, depending on how it relates to fg_src
        for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
        {
            if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
            {
                // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src

                // Before:
                // fg_src
                //      |
                //      ..          t1
                //      |           |
                //      -> fg_dst <-
                //
                // After:
                // fg_src <---t1
                //
                const auto st = link_src_tensors(fg_src->id, { t });
                if(!bool(st))
                {
                    return st;
                }
            }
            else
            {
                const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
                if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
                {
                    // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
                    // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))

                    // Before:
                    // fg_src
                    //      |
                    //      t0
                    //      |
                    //      -> fg_dst
                    //
                    // After:
                    // fg_src
                    //
                    const auto st = remove_fg_tensor(t);
                    if(!bool(st))
                    {
                        return st;
                    }
                }
                else
                {
                    // If the tensors ARE in between fg_src and fg_dst
                    // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src

                    // Before:
                    // fg_src
                    //      |
                    //      t0
                    //      |
                    //      |-----------
                    //      |          |
                    //      -> fg_dst  -> fg_other
                    //
                    // After:
                    // fg_src
                    //      |
                    //      t0
                    //      |
                    //      -> fg_other
                    //

                    // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
                    // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
                    // so that any unfused fg's (fg_other in this case) can read it.
                    // So all this means that we not only can STORE the tensors at the "end" of a fusion group,
                    // but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
                    // can be termed "dst tensors" to a fusion group
                    void(); // Deliberate no-op: the tensor stays in place as a dst tensor of the fused group
                }
            }
        }

        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
        {
            // Link any outgoing tensors of fg_dst to fg_src

            // Before:
            // fg_src
            //      |
            //      ..
            //      |
            //      -> fg_dst
            //          |
            //          |--------
            //          |       |
            //          |-> t0  |-> t1
            //
            // After:
            // fg_src
            //      |
            //      |--------
            //      |       |
            //      |-> t0  |-> t1
            //
            const auto st = link_dst_tensors(fg_src->id, { t });
            if(!bool(st))
            {
                return st;
            }
        }

        // Merge fg_dst's graph into fg_src's graph
        for(const auto kernel : traverse(*fg_dst))
        {
            fg_src->add_fused_kernel(kernel);
        }

        const auto st = remove_fg(fg_dst->id);
        return st;
    }
    /** Check whether @p fg0 and @p fg1 can legally be fused, i.e. whether the fusion would preserve all the
     *  invariants (INV) given the assumptions (ASSUMPTION) listed below
     *
     * @param[in] fg0 One of the two groups; either @p fg0 or @p fg1 must be a direct parent of the other
     * @param[in] fg1 The other group
     *
     * @return Status Error describing the violated condition, or OK if the fusion is valid
     */
    Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
    {
        /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
        /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
        ///              Note that this does not apply to fusion groups
        /// ASSUMPTION2: Simple kernels' tile infos can be overridden (share with) that of the root kernel's
        /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
        /// INV0: All Fusion groups have a single root
        /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
        /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
        /// INV3: All non-roots are Simple kernels
        /// INV4: All non roots' dst tensors have the same shape as that of the root kernel
        /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
        const ClKernelFusionGroup *fg_src {};
        const ClKernelFusionGroup *fg_dst{};

        // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
        // This guarantees INV0
        // This also finds fg_src (parent / root) and fg_dst (child / non-root)
        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
        {
            fg_src = &fg0;
            fg_dst = &fg1;
        }
        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
        {
            fg_src = &fg1;
            fg_dst = &fg0;
        }
        else
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
        }

        // Find unconnected tensors between fg_src and fg_dst
        std::vector<Id> unconnected_tensors{};
        for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
        {
            if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
            {
                unconnected_tensors.push_back(t);
            }
        }

        // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
        // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
        for(const auto &t : unconnected_tensors)
        {
            if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
            {
                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
            }
        }

        // Check 2: All non-root fgs are simple. Ensure INV3
        if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
        }

        // Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
        const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
        ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
        const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;

        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
        {
            const auto t_info = original_graph->get_tensor(t)->desc;
            if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
            {
                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
            }
        }

        // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
        if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
        }

        return Status{};
    }

    const ClKernelGraph *original_graph{}; // Unfused source graph (non-owning)
    DependencyGraph fg_dependency{};       // DAG over the fusion groups and their boundary tensors
    KernelFusionGroupMap fusion_groups{};  // Owns the fusion groups, keyed by their Id in fg_dependency
    // Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored inside the individual fusion groups.

private:
    // Connect each tensor in src_tensors as an input of fusion group fg
    Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
    {
        for(auto t : src_tensors)
        {
            fg_dependency.link_input(fg, t);
        }
        return Status{};
    }
    // Connect each tensor in dst_tensors as an output of fusion group fg
    Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
    {
        for(auto t : dst_tensors)
        {
            fg_dependency.link_output(fg, t);
        }
        return Status{};
    }
    // Remove fusion group fg from both the dependency graph and the owning map
    Status remove_fg(Id fg)
    {
        fg_dependency.remove_operator(fg);
        fusion_groups.erase(fg);
        return Status{};
    }
    // Detach a boundary tensor from the dependency graph
    Status remove_fg_tensor(Id tensor)
    {
        fg_dependency.remove_tensor(tensor);
        return Status{};
    }
};
| 436 | |
/** Get the fusion groups of @p graph in traversal order (as defined by the underlying @ref DependencyGraph) */
std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
/** Non-const overload: get the fusion groups of @p graph in traversal order */
std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);

/** Initialize a @ref ClFusedKernelGraph from an unfused @p kernel_graph
 *
 * @return Status and the initialized fusion graph
 */
std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);

/** Perform fusion over the groups of @p fused_kernel_graph where legal (see @ref ClFusedKernelGraph::can_fuse) */
Status fuse(ClFusedKernelGraph &fused_kernel_graph);

/** Add store component(s) to blueprint @p bp for fusion group @p fg
 *  NOTE(review): semantics inferred from the name only — confirm against the implementation */
Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);

/** Generate a @ref ClWorkload from @p fused_kernel_graph within workload context @p ctx */
Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
| 447 | |
| 448 | } // namespace dynamic_fusion |
| 449 | } // namespace experimental |
| 450 | } // namespace arm_compute |
SiCong Li | 4e9f568 | 2022-05-10 10:15:59 +0100 | [diff] [blame^] | 451 | #endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H |
| 452 | #endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ |