/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"

#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h" // For CLScheduler::get().enqueue_op() used below
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
#include "support/Cast.h"

#include <algorithm>
namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
namespace
{
/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
 *
 * @note The tensors are not allocated by default; the user must explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
 *
 * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it goes out of scope
 */
class ClAuxTensors
{
public:
    /** A view of a single auxiliary tensor and its associated @ref TensorInfo and @ref AuxMemoryInfo
     */
    struct DataView
    {
        DataView() = default;
        DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
            : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
        {
        }
        ~DataView()                                = default;
        DataView(const DataView &other)            = default;
        DataView &operator=(const DataView &other) = default;
        DataView(DataView &&other)                 = default;
        DataView &operator=(DataView &&other)      = default;
        CLTensor     *tensor{};      /**< Pointer to the auxiliary tensor */
        TensorInfo    tensor_info{}; /**< Associated tensor info */
        AuxMemoryInfo memory_info{}; /**< Memory requirement */
    };

    /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
    std::vector<DataView> get_tensors()
    {
        return _tensors;
    }
    std::vector<DataView> get_tensors() const
    {
        return _tensors;
    }

    friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);

private:
    /** Add an auxiliary tensor.
     *
     * @param[in] tensor_info     @ref ITensorInfo of the auxiliary tensor
     * @param[in] aux_memory_info Memory requirements of the auxiliary tensor
     *
     * @return CLTensor* Pointer to the corresponding tensor if successfully added, otherwise nullptr
     */
    CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
    {
        const auto t_id             = tensor_info.id();
        auto       find_tensor_pair = _owned_tensors.find(t_id);
        if(find_tensor_pair != _owned_tensors.end())
        {
            return find_tensor_pair->second.get();
        }
        else
        {
            auto tensor        = std::make_unique<CLTensor>();
            auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
            auto new_tensor    = inserted_pair->second.get();
            _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
            return new_tensor;
        }
    }

    std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
    std::vector<DataView>                                _tensors{};
};
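
// The views returned by ClAuxTensors::get_tensors() are what the user allocates. A minimal
// allocation sketch (illustrative only; it assumes the usual ITensorAllocator::init(info, alignment)
// interface and that AuxMemoryInfo exposes an alignment field, as in the dynamic fusion examples):
//
//   for(auto &view : aux_tensors.get_tensors())
//   {
//       view.tensor->allocator()->init(view.tensor_info, view.memory_info.alignment);
//       view.tensor->allocator()->allocate();
//   }
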
/** Construct the auxiliary tensors required by a @ref GpuWorkloadSourceCode
 *
 * @note This is the only recommended way for the user to create a @ref ClAuxTensors object
 *
 * @param[out] aux_tensors Auxiliary tensors required by the workload code
 * @param[in]  code        @ref GpuWorkloadSourceCode to which all tensors bind
 *
 * @return Status
 */
Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
{
    for(auto t_id : code.tensors())
    {
        // Get tensor object
        const auto workload_arg  = code.query_tensor(t_id);
        ICLTensor *tensor_object = nullptr;
        if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
        {
            // Create the aux tensor CLTensor object
            const TensorInfo tensor_info = *workload_arg->tensor_info();
            ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
            const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
            tensor_object              = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);

            if(tensor_object == nullptr)
            {
                return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
            }
        }
    }
    return Status{};
}

/** A fast lookup table for retrieving tensor objects at runtime
 */
class ClTensorLUT
{
public:
    /** Find the tensor pack associated with the @ref UnitWorkloadId @p uwk_id
     *
     * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
     *
     * @return ITensorPack* Pointer to the tensor pack if found, otherwise nullptr
     */
    ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
    {
        auto tensor_pack = _tensor_packs.find(uwk_id);
        if(tensor_pack != _tensor_packs.end())
        {
            return &(tensor_pack->second);
        }
        return nullptr;
    }
    /** Get the tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
     *
     * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
     *
     * @return ITensorPack& Reference to the tensor pack
     */
    ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
    {
        return _tensor_packs.at(uwk_id);
    }

    friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors);

private:
    /** Add a tensor pack and associate it with the @ref UnitWorkloadId @p uwk_id
     *
     * @param[in] uwk_id      @ref UnitWorkloadId associated with the tensor pack
     * @param[in] tensor_pack Tensor pack to be added
     */
    void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
    {
        _tensor_packs[uwk_id] = tensor_pack;
    }
    std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
};
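
// Illustrative consumption of the table (mirroring the enqueue loops further down): once
// create_tensor_lut() has populated it, each unit workload fetches its pack at enqueue time:
//
//   ITensorPack &pack = tensor_lut.get_tensor_pack(uwk_id);
//   CLScheduler::get().enqueue_op(kernel, pack, /* flush_queue */ false);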

/** Create a fast lookup table for runtime tensor retrieval
 *
 * @param[out] tensor_lut   @ref ClTensorLUT used by the runtime to feed tensor memories to the underlying kernels
 * @param[in]  code         @ref GpuWorkloadSourceCode to which all tensors bind
 * @param[in]  user_tensors User tensors
 * @param[in]  aux_tensors  Auxiliary tensors required by the workload code
 *
 * @return Status
 */
Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors)
{
    // Combine user tensors and aux tensors
    std::map<ITensorInfo::Id, CLTensor *> tensor_map;
    for(auto tensor : user_tensors)
    {
        const auto t_id = tensor->info()->id();

        if(tensor_map.find(t_id) != tensor_map.end())
        {
            // In-place elementwise case: the same in/out tensor is passed twice, so give its
            // second occurrence a fresh Id to keep the map keys unique
            std::vector<ITensorInfo::Id> ids;
            for(auto &t : tensor_map)
            {
                ids.push_back(t.first);
            }
            const ITensorInfo::Id new_id = *std::max_element(ids.begin(), ids.end()) + 1;
            tensor_map[new_id]           = tensor;
        }
        else
        {
            tensor_map[t_id] = tensor;
        }
    }
    for(const auto &data : aux_tensors.get_tensors())
    {
        const auto t_id   = data.tensor_info.id();
        const auto tensor = data.tensor;
        if(tensor_map.find(t_id) != tensor_map.end())
        {
            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
        }
        tensor_map[t_id] = tensor;
    }

    // Add the tensor objects into the corresponding tensor packs
    for(auto id_tensor : tensor_map)
    {
        const auto t_id          = id_tensor.first;
        const auto tensor_object = id_tensor.second;
        if(tensor_object == nullptr)
        {
            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
        }
        if(tensor_object->allocator()->info().total_size() == 0U)
        {
            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
        }

        for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
        {
            ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
            if(tensor_pack == nullptr)
            {
                tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
            }
            else
            {
                tensor_pack->add_tensor(t_id, tensor_object);
            }
        }
    }

    return Status{};
}
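
// Worked example of the in-place Id reassignment in create_tensor_lut() above (values
// hypothetical): if the user passes { t0 (Id 1), t1 (Id 2), t0 (Id 1) }, the map becomes
// { 1: t0, 2: t1, 3: t0 } - the repeated in/out tensor gets key 3 (= max Id + 1), so no
// key clash occurs while both entries still point at the same CLTensor object.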

} // namespace

struct ClWorkloadRuntime::Implementation
{
    std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};      /**< Kernels enqueued on every run */
    std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{}; /**< Kernels enqueued once, during prepare */
    bool                  _is_configured{ false };
    bool                  _is_prepared{ false };
    ClTensorLUT           _tensor_lut{};
    ClAuxTensors          _aux_tensors{};
    GpuWorkloadSourceCode _source_code{};
};

ClWorkloadRuntime::ClWorkloadRuntime()
    : _impl{ std::make_unique<Implementation>() }
{
}

ClWorkloadRuntime::~ClWorkloadRuntime() = default;

Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
{
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with a non-OpenCL workload sketch");
    // Generate source code
    _impl->_source_code = sketch.implementation().generate_source_code();
    // Configure a kernel runtime for each unit workload in the source code
    for(auto uwk_id : _impl->_source_code.unit_workloads())
    {
        const auto work  = _impl->_source_code.query_unit_workload(uwk_id);
        const auto stage = work.stage().stage;
        auto       k     = std::make_unique<ClKernelRuntime>();
        k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());

        switch(stage)
        {
            case UnitWorkloadStage::Stage::Run:
            {
                _impl->_kernels.emplace(work.id(), std::move(k));
                break;
            }
            case UnitWorkloadStage::Stage::Prepare:
            {
                _impl->_kernels_prep.emplace(work.id(), std::move(k));
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Invalid unit workload stage");
            }
        }
    }
    // Create auxiliary tensor objects, propagating any failure
    ARM_COMPUTE_RETURN_ON_ERROR(create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code));
    _impl->_is_configured = true;
    return Status{};
}

void ClWorkloadRuntime::prepare()
{
    if(!_impl->_is_prepared)
    {
        // Enqueue all Prepare-stage kernels exactly once
        for(auto &id_kernel_pair : _impl->_kernels_prep)
        {
            const bool flush_queue = false;
            const auto uwk_id      = id_kernel_pair.first;
            auto       kernel      = id_kernel_pair.second.get();
            CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
        }

        _impl->_is_prepared = true;
    }
}
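
// Note: prepare() is idempotent and is also invoked from run(), so calling it explicitly is
// optional. The tensor lookup table is only populated inside run(), so a direct call to
// prepare() before the first run() would find no tensor packs for any Prepare-stage kernels
// (get_tensor_pack() would then throw).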

Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
{
    // The tensor lut must be recreated on every run, unless the user can guarantee that the
    // tensor bindings remain fixed, in which case the lut could be cached during prepare
    const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
    ARM_COMPUTE_RETURN_ON_ERROR(st);
    prepare();
    for(auto &id_kernel_pair : _impl->_kernels)
    {
        // Note: the command queue is not flushed for any kernel here; flushing is left to the caller
        const bool flush_queue = false;
        const auto uwk_id      = id_kernel_pair.first;
        auto       kernel      = id_kernel_pair.second.get();
        CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
    }
    return Status{};
}

std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
{
    std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> aux_tensors;
    for(const auto &data : _impl->_aux_tensors.get_tensors())
    {
        aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info);
    }
    return aux_tensors;
}
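
// A minimal end-to-end usage sketch (illustrative only: sketch construction is elided, the
// tensor names are hypothetical, and it assumes the usual ITensorAllocator::init(info, alignment)
// interface with AuxMemoryInfo exposing an alignment field):
//
//   ClWorkloadRuntime runtime{};
//   ARM_COMPUTE_ERROR_THROW_ON(runtime.configure(sketch));
//   // Auxiliary tensors are not allocated by the runtime; the user must allocate them explicitly
//   for(auto &data : runtime.get_auxiliary_tensors())
//   {
//       CLTensor     *tensor   = std::get<0>(data);
//       TensorInfo    info     = std::get<1>(data);
//       AuxMemoryInfo mem_info = std::get<2>(data);
//       tensor->allocator()->init(info, mem_info.alignment);
//       tensor->allocator()->allocate();
//   }
//   ARM_COMPUTE_ERROR_THROW_ON(runtime.run({ &t_in, &t_out }));
//   CLScheduler::get().sync(); // block until the enqueued kernels finish
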
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute