/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H

#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Window.h"

#include "arm_compute/core/experimental/IWorkload.h"
#include "arm_compute/core/experimental/OperatorGraph.h"

#include <map>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Verbose and explicit way to enumerate all the tensor argument variants used by
 * all kernel implementations. This avoids any ambiguity in which kernel arguments are passed.
 */
enum class ClKernelTensorArgType : int
{
    Scalar,

    Vector,

    Image,
    Image_Reinterpret_As_3D,
    Image_Export_To_ClImage2D,

    Image_3D, // 3D tensor represented as a 2D image + stride_z
    Image_3D_Export_To_ClImage2D,

    Tensor_3D,
    Tensor_4D,
    Tensor_4D_t_Buffer,
    Tensor_4D_t_Image
};

/** Describes all the info required to add a kernel argument at run time
 *
 * @note This struct can later be expanded into a more concise and formal way to specify how to set up
 * arguments for a kernel inside a @ref ClUnitWorkload
 */
struct ClKernelArgDescriptor
{
    ClKernelArgDescriptor() = default;
    ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
        : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
    {
    }
    ~ClKernelArgDescriptor() = default;
    friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1)
    {
        return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz);
    }
    int                   arg_id{ -1 };                                    /**< Arg ID in the blueprint, -1 means empty / uninitialized */
    ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< tensor argument type */
    bool                  slide_along_dimz{ true };                        /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */
};
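
/* Usage sketch: describing kernel argument 0 as a 3D tensor argument that does not
 * slide along dimension z (the argument ID is illustrative only):
 *
 *     ClKernelArgDescriptor desc{ 0, ClKernelTensorArgType::Tensor_3D, false };
 */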

using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;

/** Descriptor containing information required to run a single ClWorkload
 */
struct ClExecutionDescriptor
{
    cl::NDRange suggested_lws{};              /**< Suggested local work-group size for optimal performance, if not zero */
    cl::NDRange gws{};                        /**< Global work size to be used */
    bool        skip_sliding_window{ false }; /**< Skip sliding window slices during the execution loop */
};
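
/* Usage sketch: launching over a 2D 128x128 global range while leaving the local
 * work-group size to the runtime (the sizes are illustrative only):
 *
 *     ClExecutionDescriptor exec_desc{};
 *     exec_desc.gws                 = cl::NDRange(128, 128);
 *     exec_desc.suggested_lws       = cl::NDRange(); // null range: no suggestion
 *     exec_desc.skip_sliding_window = false;
 */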

/** Contains kernel code to be compiled and run in a @ref ClUnitWorkload
 */
struct ClKernelCode
{
    friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1)
    {
        return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window)
               && (code0.arguments == code1.arguments);
    }
    std::string     name{};          /**< Kernel name */
    std::string     code{};          /**< Kernel source code */
    std::string     config_id{};     /**< Generated from the blueprint, based on the complex component */
    CLBuildOptions  build_options{}; /**< Kernel build options */
    Window          window{};        /**< Execution window */
    ClKernelArgList arguments{};     /**< Kernel argument descriptors. The map key is the kernel ArgumentID */
};
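
/* Usage sketch: filling in a ClKernelCode by hand. In practice the code and config_id
 * are generated from a blueprint; the kernel name, source and build option below are
 * illustrative placeholders only.
 *
 *     ClKernelCode code{};
 *     code.name = "example_fused_kernel";
 *     code.code = "..."; // generated OpenCL C source (placeholder)
 *     code.build_options.add_option("-DDATA_TYPE=float");
 *     code.window.set(Window::DimX, Window::Dimension(0, 128, 1));
 *     code.arguments[0] = ClKernelArgDescriptor{ 0, ClKernelTensorArgType::Image };
 *     code.arguments[1] = ClKernelArgDescriptor{ 1, ClKernelTensorArgType::Image };
 */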

/** A descriptor of ClWorkload tensors.
 */
struct ClWorkloadTensor : public WorkloadTensor
{
    ClWorkloadTensor() = default;
    ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
        : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
    {
    }
    ClKernelArgDescriptor kernel_arg{};
    friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1)
    {
        return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg;
    }
};

/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run.
 */
struct ClUnitWorkload : public UnitWorkload
{
    ClUnitWorkload() = default;
    ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
        : UnitWorkload{ id, stage }, code{ code }
    {
    }
    friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1)
    {
        return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code;
    }
    ClKernelCode code{};
};

/** GPU information for @ref ClWorkloadContext
 */
struct GpuInfo
{
    friend bool operator==(const GpuInfo &info0, const GpuInfo &info1)
    {
        return info0.target == info1.target;
    }
    GPUTarget target{ GPUTarget::UNKNOWN };
};

/** Context (device capabilities, platform details) associated with a ClWorkload
 *
 * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers)
 */
struct ClWorkloadContext
{
    friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1)
    {
        return ctx0.gpu_info == ctx1.gpu_info;
    }
    GpuInfo gpu_info{};
};

/** Workload for the Cl backend
 */
struct ClWorkload : public IWorkload
{
    Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
    {
        Tid id = graph.add_tensor(merge_point);
        if(tensors.find(id) == tensors.end())
        {
            tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg);
        }
        return id;
    }
    UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
    {
        auto op = graph.add_operator(inputs, outputs);
        auto id = op.second;
        unit_workloads[id] = ClUnitWorkload(id, stage, code);
        return id;
    }
    friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
    {
        return std::make_tuple(
                   workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
               == std::make_tuple(
                   workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
    }
    ClWorkloadContext context{};                           /**< Workload context */
    std::map<UnitWorkId, ClUnitWorkload> unit_workloads{}; /**< Unit workloads to run */
    std::map<Tid, ClWorkloadTensor> tensors{};             /**< Workload tensors */
    std::map<Tid, OpTensor::Id> op_tensor_id_lut{};        /**< Map from ClWorkloadTensor to SRC and DST operator tensors (no need to store "intermediate" operator tensors) */
    Status status{};                                       /**< For compatibility with the IOperator validate method. Stores whether the workload is valid. */
};
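
/* Usage sketch: registering one input tensor, one output tensor and one unit workload
 * by hand (normally done by build() below rather than by users). MemoryType::Core, the
 * UnitWorkloadStage member names and the -1 "no merge point" sentinel are assumptions
 * here; see IWorkload.h for the actual definitions.
 *
 *     ITensorInfo *in_info  = ...; // caller-owned tensor infos
 *     ITensorInfo *out_info = ...;
 *     ClKernelCode code     = ...; // e.g. generated from a blueprint
 *
 *     ClWorkload workload{};
 *     Tid t_in  = workload.add_workload_tensor(in_info, MemoryType::Core, AuxMemoryInfo{},
 *                                              ClKernelArgDescriptor{ 0, ClKernelTensorArgType::Image }, -1);
 *     Tid t_out = workload.add_workload_tensor(out_info, MemoryType::Core, AuxMemoryInfo{},
 *                                              ClKernelArgDescriptor{ 1, ClKernelTensorArgType::Image }, -1);
 *     UnitWorkloadStage stage{};
 *     stage.stage = UnitWorkloadStage::Stage::Run; // assumed member / enum names
 *     workload.add_unit_workload(stage, code, { t_in }, { t_out });
 */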

/** Build a @ref ClWorkload from an @ref OperatorGraph.
 *
 * @param[out] workload Workload built from the operator graph
 * @param[in]  op_graph Operator graph from which to build the workload
 * @param[in]  ctx      Context (device capabilities, platform details) of the workload
 *
 * @return Status
 */
Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);
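
/* Usage sketch: building a ClWorkload from an OperatorGraph. How op_graph is populated
 * with operators and tensors is described in OperatorGraph.h and elided here; the GPU
 * target is illustrative.
 *
 *     OperatorGraph op_graph;
 *     // ... add operators and tensors to op_graph ...
 *     ClWorkloadContext ctx{};
 *     ctx.gpu_info.target = GPUTarget::BIFROST;
 *     ClWorkload workload{};
 *     const Status st = build(workload, op_graph, ctx);
 *     ARM_COMPUTE_ERROR_THROW_ON(st);
 */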

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute

#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */