Blame - src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp - ml/ComputeLibrary

blob: 36168d14f10dd537d452bd30cff6d39f64ebba06 [file] [log] [blame]

SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2022 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
				25
				26	#include "arm_compute/core/experimental/Types.h"
				27	#include "arm_compute/runtime/CL/CLTensor.h"
				28	#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
				29	#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
				30	#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
				31	#include "support/Cast.h"
				32
				33	#include <algorithm>
				34
				35	namespace arm_compute
				36	{
				37	namespace experimental
				38	{
				39	namespace dynamic_fusion
				40	{
				41	namespace
				42	{
				43	/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
				44	*
				45	* @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
				46	*
				47	* @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it, is out of scope
				48	*/
				49	class ClAuxTensors
				50	{
				51	public:
				52	/** A view of a single auxiliary data and the associated @ref TensorInfo and @ref AuxMemoryInfo
				53	*/
				54	struct DataView
				55	{
				56	DataView() = default;
				57	DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
				58	: tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
				59	{
				60	}
				61	~DataView() = default;
				62	DataView(const DataView &other) = default;
				63	DataView &operator=(const DataView &other) = default;
				64	DataView(DataView &&other) = default;
				65	DataView &operator=(DataView &&other) = default;
				66	CLTensor tensor{}; /< Pointer to the auxiliary tensor /
				67	TensorInfo tensor_info{}; /*< Associated tensor info /
				68	AuxMemoryInfo memory_info{}; /*< Memory requirement /
				69	};
				70
				71	/** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
				72	std::vector<DataView> get_tensors()
				73	{
				74	return _tensors;
				75	}
				76	std::vector<DataView> get_tensors() const
				77	{
				78	return _tensors;
				79	}
				80
				81	friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);
				82
				83	private:
				84	/** Add auxiliary tensor.
				85	*
				86	* @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
				87	* @param[in] memory_info Memory requirements of the auxiliary tensor
				88	*
				89	* @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
				90	*/
				91	CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
				92	{
				93	const auto t_id = tensor_info.id();
				94	auto find_tensor_pair = _owned_tensors.find(t_id);
Ramy Elgammal	404462a	2022-11-08 02:14:46 +0000	[diff] [blame^]	95	if(find_tensor_pair != _owned_tensors.end())
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	96	{
				97	return find_tensor_pair->second.get();
				98	}
				99	else
				100	{
				101	auto tensor = std::make_unique<CLTensor>();
				102	auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
				103	auto new_tensor = inserted_pair->second.get();
				104	_tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
				105	return new_tensor;
				106	}
				107	}
				108
				109	std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
				110	std::vector<DataView> _tensors{};
				111	};
				112	/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
				113	*
				114	* @note This is the only recommended method for user to create @ref ClAuxTensors
				115	*
				116	* @param[out] aux_tensors Auxiliary tensors required by the workload code
				117	* @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
				118	*
				119	* @return Status
				120	*/
				121	Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
				122	{
				123	for(auto t_id : code.tensors())
				124	{
				125	// Get tensor object
				126	const auto workload_arg = code.query_tensor(t_id);
				127	ICLTensor *tensor_object = nullptr;
				128	if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
				129	{
				130	// Create aux tensor CLTensor object
				131	const TensorInfo tensor_info = *workload_arg->tensor_info();
				132	ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
				133	const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
				134	tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
				135	}
				136	if(tensor_object == nullptr)
				137	{
				138	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
				139	}
				140	}
				141	return Status{};
				142	}
				143
				144	/** A fast tensor lookup table for runtime tensor objects retrieval
				145	*/
				146	class ClTensorLUT
				147	{
				148	public:
				149	/** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
				150	*
				151	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
				152	*
				153	* @return ITensorPack*
				154	*/
				155	ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
				156	{
				157	auto tensor_pack = _tensor_packs.find(uwk_id);
				158	if(tensor_pack != _tensor_packs.end())
				159	{
				160	return &(tensor_pack->second);
				161	}
				162	return nullptr;
				163	}
				164	/** Get a tensor pack associated with @p uwk_id. Throws a exception if it cannot be found.
				165	*
				166	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
				167	*
				168	* @return ITensorPack*
				169	*/
				170	ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
				171	{
				172	return _tensor_packs.at(uwk_id);
				173	}
				174
				175	friend Status create_tensor_lut(ClTensorLUT tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor > &user_tensors, const ClAuxTensors &aux_tensors);
				176
				177	private:
				178	/** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
				179	*
				180	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
				181	* @param[in] tensor_pack Tensor pack to be added
				182	*/
				183	void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
				184	{
				185	_tensor_packs[uwk_id] = tensor_pack;
				186	}
				187	std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
				188	};
				189
				190	/** Create a fast tensor lookup table for runtime tensor retrieval
				191	*
				192	* @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
				193	* @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
				194	* @param[in] user_tensors User tensors
				195	* @param[in] aux_tensors Auxiliary tensors required by the workload code
				196	*
				197	* @return Status
				198	*/
				199	Status create_tensor_lut(ClTensorLUT tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor > &user_tensors, const ClAuxTensors &aux_tensors)
				200	{
				201	// Combine user tensors and aux tensors
				202	std::map<ITensorInfo::Id, CLTensor *> tensor_map;
				203	for(auto tensor : user_tensors)
				204	{
				205	const auto t_id = tensor->info()->id();
Ramy Elgammal	404462a	2022-11-08 02:14:46 +0000	[diff] [blame^]	206
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	207	if(tensor_map.find(t_id) != tensor_map.end())
				208	{
Ramy Elgammal	404462a	2022-11-08 02:14:46 +0000	[diff] [blame^]	209	// In case of elementwise in-place: give another Id to the In/Out tensor when passed again
				210	std::vector<ITensorInfo::Id> ids;
				211	for(auto &t : tensor_map)
				212	{
				213	ids.push_back(t.first);
				214	}
				215	ITensorInfo::Id new_id = *std::max_element(ids.begin(), ids.end()) + 1;
				216	tensor_map[new_id] = tensor;
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	217	}
Ramy Elgammal	404462a	2022-11-08 02:14:46 +0000	[diff] [blame^]	218	else
				219	{
				220	tensor_map[t_id] = tensor;
				221	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	222	}
				223	for(const auto &data : aux_tensors.get_tensors())
				224	{
				225	const auto t_id = data.tensor_info.id();
				226	const auto tensor = data.tensor;
				227	if(tensor_map.find(t_id) != tensor_map.end())
				228	{
				229	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
				230	}
				231	tensor_map[t_id] = tensor;
				232	}
				233
				234	// Add tensor objects into corresponding tensor packs
				235	for(auto id_tensor : tensor_map)
				236	{
				237	const auto t_id = id_tensor.first;
				238	const auto tensor_object = id_tensor.second;
				239	if(tensor_object == nullptr)
				240	{
				241	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
				242	}
				243	if(tensor_object->allocator()->info().total_size() == 0U)
				244	{
				245	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
				246	}
				247
				248	for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
				249	{
				250	ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
				251	if(tensor_pack == nullptr)
				252	{
				253	tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
				254	}
				255	else
				256	{
				257	tensor_pack->add_tensor(t_id, tensor_object);
				258	}
				259	}
				260	}
Ramy Elgammal	404462a	2022-11-08 02:14:46 +0000	[diff] [blame^]	261
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	262	return Status{};
				263	}
				264
				265	} // namespace
				266
				267	struct ClWorkloadRuntime::Implementation
				268	{
				269	std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
				270	std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
				271	bool _is_configured{ false };
				272	bool _is_prepared{ false };
				273	ClTensorLUT _tensor_lut{};
				274	ClAuxTensors _aux_tensors{};
				275	GpuWorkloadSourceCode _source_code{};
				276	};
				277
				278	ClWorkloadRuntime::ClWorkloadRuntime()
				279	: _impl{ std::make_unique<Implementation>() }
				280	{
				281	}
				282
				283	ClWorkloadRuntime::~ClWorkloadRuntime() = default;
				284
				285	Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
				286	{
				287	ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
				288	ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
				289	// Generate source code
				290	_impl->_source_code = sketch.implementation().generate_source_code();
				291	// Configure unit workload from source code
				292	for(auto uwk_id : _impl->_source_code.unit_workloads())
				293	{
				294	const auto work = _impl->_source_code.query_unit_workload(uwk_id);
				295	const auto stage = work.stage().stage;
				296	auto k = std::make_unique<ClKernelRuntime>();
				297	k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
				298
				299	switch(stage)
				300	{
				301	case UnitWorkloadStage::Stage::Run:
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame]	302	{
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	303	_impl->_kernels.emplace(work.id(), std::move(k));
				304	break;
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame]	305	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	306	case UnitWorkloadStage::Stage::Prepare:
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame]	307	{
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	308	_impl->_kernels_prep.emplace(work.id(), std::move(k));
				309	break;
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame]	310	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	311	default:
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame]	312	{
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	313	ARM_COMPUTE_ERROR("Invalid unit workload stage");
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame]	314	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	315	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	316	}
				317	// Create auxiliary tensor objects
				318	create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
				319	_impl->_is_configured = true;
				320	return Status{};
				321	}
				322
				323	void ClWorkloadRuntime::prepare()
				324	{
				325	if(!_impl->_is_prepared)
				326	{
				327	for(auto &id_kernel_pair : _impl->_kernels_prep)
				328	{
				329	const bool flush_queue = false;
				330	const auto uwk_id = id_kernel_pair.first;
				331	auto kernel = id_kernel_pair.second.get();
				332	CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
				333	}
				334
				335	_impl->_is_prepared = true;
				336	}
				337	}
				338
				339	Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
				340	{
				341	// Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
				342	// in which case the lut can be cached during prepare
				343	const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
				344	ARM_COMPUTE_RETURN_ON_ERROR(st);
				345	prepare();
				346	for(auto &id_kernel_pair : _impl->_kernels)
				347	{
				348	// Flush the command queue on the last kernel
				349	const bool flush_queue = false;
				350	const auto uwk_id = id_kernel_pair.first;
				351	auto kernel = id_kernel_pair.second.get();
				352	CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
				353	}
				354	return Status{};
				355	}
				356
				357	std::vector<std::pair<CLTensor *, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
				358	{
				359	std::vector<std::pair<CLTensor *, AuxMemoryInfo>> aux_tensors;
				360	for(const auto &data : _impl->_aux_tensors.get_tensors())
				361	{
				362	aux_tensors.emplace_back(data.tensor, data.memory_info);
				363	}
				364	return aux_tensors;
				365	}
				366	} // namespace dynamic_fusion
				367	} // namespace experimental
				368	} // namespace arm_compute