/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H

#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Window.h"

#include "arm_compute/core/experimental/IWorkload.h"
#include "arm_compute/core/experimental/OperatorGraph.h"

#include <map>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Verbose and explicit way to enumerate all the tensor argument variants used by
 * all kernel implementations. This avoids any ambiguity in which kernel arguments are passed.
 */
enum class ClKernelTensorArgType : int
{
    Scalar,

    Vector,

    Image,
    Image_Reinterpret_As_3D,
    Image_Export_To_ClImage2D,

    Image_3D, // 3D tensor represented as a 2D image + stride_z
    Image_3D_Export_To_ClImage2D,

    Tensor_3D,
    Tensor_4D,
    Tensor_4D_t_Buffer,
    Tensor_4D_t_Image
};

/** Describes all the info required to add a kernel argument at run time
 *
 * @note This struct can later be expanded into a more concise and formal way to specify how to set up
 * arguments for a kernel inside a @ref ClUnitWorkload
 */
struct ClKernelArgDescriptor
{
    ClKernelArgDescriptor() = default;
    ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
        : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
    {
    }
    ~ClKernelArgDescriptor() = default;
    friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1)
    {
        return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz);
    }
    int                   arg_id{ -1 };                                    /**< Arg ID in the blueprint, -1 means empty / uninitialized */
    ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< tensor argument type */
    bool                  slide_along_dimz{ true };                        /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */
};
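
/* Usage sketch: describing kernel argument 0 as a 3D tensor argument that does not
 * slide along dimension z (the argument ID is illustrative only):
 *
 *     ClKernelArgDescriptor desc{ 0, ClKernelTensorArgType::Tensor_3D, false };
 */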

using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;

/** Descriptor containing information required to run a single ClWorkload
 */
struct ClExecutionDescriptor
{
    cl::NDRange suggested_lws{};              /**< Suggested local work-group size for optimal performance, if not zero */
    cl::NDRange gws{};                        /**< Global work size to be used */
    bool        skip_sliding_window{ false }; /**< Skip sliding window slices during the execution loop */
};
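
/* Usage sketch: launching over a 2D 128x128 global range while leaving the local
 * work-group size to the runtime (the sizes are illustrative only):
 *
 *     ClExecutionDescriptor exec_desc{};
 *     exec_desc.gws                 = cl::NDRange(128, 128);
 *     exec_desc.suggested_lws       = cl::NDRange(); // null range: no suggestion
 *     exec_desc.skip_sliding_window = false;
 */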

/** Contains kernel code to be compiled and run in a @ref ClUnitWorkload
 */
struct ClKernelCode
{
    friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1)
    {
        return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window)
               && (code0.arguments == code1.arguments);
    }
    std::string     name{};          /**< Kernel name */
    std::string     code{};          /**< Kernel source code */
    std::string     config_id{};     /**< Generated from the blueprint, based on the complex component */
    CLBuildOptions  build_options{}; /**< Kernel build options */
    Window          window{};        /**< Execution window */
    ClKernelArgList arguments{};     /**< Kernel argument descriptors. The map key is the kernel ArgumentID */
};
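
/* Usage sketch: filling in a ClKernelCode by hand. In practice the code and config_id
 * are generated from a blueprint; the kernel name, source and build option below are
 * illustrative placeholders only.
 *
 *     ClKernelCode code{};
 *     code.name = "example_fused_kernel";
 *     code.code = "..."; // generated OpenCL C source (placeholder)
 *     code.build_options.add_option("-DDATA_TYPE=float");
 *     code.window.set(Window::DimX, Window::Dimension(0, 128, 1));
 *     code.arguments[0] = ClKernelArgDescriptor{ 0, ClKernelTensorArgType::Image };
 *     code.arguments[1] = ClKernelArgDescriptor{ 1, ClKernelTensorArgType::Image };
 */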

/** A descriptor of ClWorkload tensors.
 */
struct ClWorkloadTensor : public WorkloadTensor
{
    ClWorkloadTensor() = default;
    ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
        : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
    {
    }
    ClKernelArgDescriptor kernel_arg{};
    friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1)
    {
        return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg;
    }
};

/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run.
 */
struct ClUnitWorkload : public UnitWorkload
{
    ClUnitWorkload() = default;
    ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
        : UnitWorkload{ id, stage }, code{ code }
    {
    }
    friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1)
    {
        return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code;
    }
    ClKernelCode code{};
};

/** GPU information for @ref ClWorkloadContext
 */
struct GpuInfo
{
    friend bool operator==(const GpuInfo &info0, const GpuInfo &info1)
    {
        return info0.target == info1.target;
    }
    GPUTarget target{ GPUTarget::UNKNOWN };
};

/** Context (device capabilities, platform details) associated with a ClWorkload
 *
 * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers)
 */
struct ClWorkloadContext
{
    friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1)
    {
        return ctx0.gpu_info == ctx1.gpu_info;
    }
    GpuInfo gpu_info{};
};

/** Workload for the Cl backend
 */
struct ClWorkload : public IWorkload
{
    Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
    {
        Tid id = graph.add_tensor(merge_point);
        if(tensors.find(id) == tensors.end())
        {
            tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg);
        }
        return id;
    }
    UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
    {
        auto op = graph.add_operator(inputs, outputs);
        auto id = op.second;
        unit_workloads[id] = ClUnitWorkload(id, stage, code);
        return id;
    }
    friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
    {
        return std::make_tuple(
                   workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
               == std::make_tuple(
                   workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
    }
    ClWorkloadContext context{};                           /**< Workload context */
    std::map<UnitWorkId, ClUnitWorkload> unit_workloads{}; /**< Unit workloads to run */
    std::map<Tid, ClWorkloadTensor> tensors{};             /**< Workload tensors */
    std::map<Tid, OpTensor::Id> op_tensor_id_lut{};        /**< Map from ClWorkloadTensor to SRC and DST operator tensors (no need to store "intermediate" operator tensors) */
    Status status{};                                       /**< For compatibility with the IOperator validate method. Stores whether the workload is valid. */
};
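
/* Usage sketch: registering one input tensor, one output tensor and one unit workload
 * by hand (normally done by build() below rather than by users). MemoryType::Core, the
 * UnitWorkloadStage member names and the -1 "no merge point" sentinel are assumptions
 * here; see IWorkload.h for the actual definitions.
 *
 *     ITensorInfo *in_info  = ...; // caller-owned tensor infos
 *     ITensorInfo *out_info = ...;
 *     ClKernelCode code     = ...; // e.g. generated from a blueprint
 *
 *     ClWorkload workload{};
 *     Tid t_in  = workload.add_workload_tensor(in_info, MemoryType::Core, AuxMemoryInfo{},
 *                                              ClKernelArgDescriptor{ 0, ClKernelTensorArgType::Image }, -1);
 *     Tid t_out = workload.add_workload_tensor(out_info, MemoryType::Core, AuxMemoryInfo{},
 *                                              ClKernelArgDescriptor{ 1, ClKernelTensorArgType::Image }, -1);
 *     UnitWorkloadStage stage{};
 *     stage.stage = UnitWorkloadStage::Stage::Run; // assumed member / enum names
 *     workload.add_unit_workload(stage, code, { t_in }, { t_out });
 */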

/** Build a @ref ClWorkload from an @ref OperatorGraph.
 *
 * @param[out] workload Workload built from the operator graph
 * @param[in]  op_graph Operator graph from which to build the workload
 * @param[in]  ctx      Context (device capabilities, platform details) of the workload
 *
 * @return Status
 */
Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);
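
/* Usage sketch: building a ClWorkload from an OperatorGraph. How op_graph is populated
 * with operators and tensors is described in OperatorGraph.h and elided here; the GPU
 * target is illustrative.
 *
 *     OperatorGraph op_graph;
 *     // ... add operators and tensors to op_graph ...
 *     ClWorkloadContext ctx{};
 *     ctx.gpu_info.target = GPUTarget::BIFROST;
 *     ClWorkload workload{};
 *     const Status st = build(workload, op_graph, ctx);
 *     ARM_COMPUTE_ERROR_THROW_ON(st);
 */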

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute

#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */