Blame - src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp - ml/ComputeLibrary

blob: f14f66d1bdb1ba1a7a6efa674ed3e43f3b443113 [file] [log] [blame]

SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2022 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
				25
				26	#include "arm_compute/core/experimental/Types.h"
				27	#include "arm_compute/runtime/CL/CLTensor.h"
				28	#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
				29	#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
				30	#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
				31	#include "support/Cast.h"
				32
				33	#include <algorithm>
				34
				35	namespace arm_compute
				36	{
				37	namespace experimental
				38	{
				39	namespace dynamic_fusion
				40	{
				41	namespace
				42	{
				43	/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
				44	*
				45	* @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
				46	*
				47	* @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it, is out of scope
				48	*/
				49	class ClAuxTensors
				50	{
				51	public:
				52	/** A view of a single auxiliary data and the associated @ref TensorInfo and @ref AuxMemoryInfo
				53	*/
				54	struct DataView
				55	{
				56	DataView() = default;
				57	DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
				58	: tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
				59	{
				60	}
				61	~DataView() = default;
				62	DataView(const DataView &other) = default;
				63	DataView &operator=(const DataView &other) = default;
				64	DataView(DataView &&other) = default;
				65	DataView &operator=(DataView &&other) = default;
				66	CLTensor tensor{}; /< Pointer to the auxiliary tensor /
				67	TensorInfo tensor_info{}; /*< Associated tensor info /
				68	AuxMemoryInfo memory_info{}; /*< Memory requirement /
				69	};
				70
				71	/** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
				72	std::vector<DataView> get_tensors()
				73	{
				74	return _tensors;
				75	}
				76	std::vector<DataView> get_tensors() const
				77	{
				78	return _tensors;
				79	}
				80
				81	friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);
				82
				83	private:
				84	/** Add auxiliary tensor.
				85	*
				86	* @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
				87	* @param[in] memory_info Memory requirements of the auxiliary tensor
				88	*
				89	* @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
				90	*/
				91	CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
				92	{
				93	const auto t_id = tensor_info.id();
				94	auto find_tensor_pair = _owned_tensors.find(t_id);
				95	if(find_tensor_pair == _owned_tensors.end())
				96	{
				97	return find_tensor_pair->second.get();
				98	}
				99	else
				100	{
				101	auto tensor = std::make_unique<CLTensor>();
				102	auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
				103	auto new_tensor = inserted_pair->second.get();
				104	_tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
				105	return new_tensor;
				106	}
				107	}
				108
				109	std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
				110	std::vector<DataView> _tensors{};
				111	};
				112	/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
				113	*
				114	* @note This is the only recommended method for user to create @ref ClAuxTensors
				115	*
				116	* @param[out] aux_tensors Auxiliary tensors required by the workload code
				117	* @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
				118	*
				119	* @return Status
				120	*/
				121	Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
				122	{
				123	for(auto t_id : code.tensors())
				124	{
				125	// Get tensor object
				126	const auto workload_arg = code.query_tensor(t_id);
				127	ICLTensor *tensor_object = nullptr;
				128	if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
				129	{
				130	// Create aux tensor CLTensor object
				131	const TensorInfo tensor_info = *workload_arg->tensor_info();
				132	ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
				133	const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
				134	tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
				135	}
				136	if(tensor_object == nullptr)
				137	{
				138	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
				139	}
				140	}
				141	return Status{};
				142	}
				143
				144	/** A fast tensor lookup table for runtime tensor objects retrieval
				145	*/
				146	class ClTensorLUT
				147	{
				148	public:
				149	/** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
				150	*
				151	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
				152	*
				153	* @return ITensorPack*
				154	*/
				155	ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
				156	{
				157	auto tensor_pack = _tensor_packs.find(uwk_id);
				158	if(tensor_pack != _tensor_packs.end())
				159	{
				160	return &(tensor_pack->second);
				161	}
				162	return nullptr;
				163	}
				164	/** Get a tensor pack associated with @p uwk_id. Throws a exception if it cannot be found.
				165	*
				166	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
				167	*
				168	* @return ITensorPack*
				169	*/
				170	ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
				171	{
				172	return _tensor_packs.at(uwk_id);
				173	}
				174
				175	friend Status create_tensor_lut(ClTensorLUT tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor > &user_tensors, const ClAuxTensors &aux_tensors);
				176
				177	private:
				178	/** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
				179	*
				180	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
				181	* @param[in] tensor_pack Tensor pack to be added
				182	*/
				183	void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
				184	{
				185	_tensor_packs[uwk_id] = tensor_pack;
				186	}
				187	std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
				188	};
				189
				190	/** Create a fast tensor lookup table for runtime tensor retrieval
				191	*
				192	* @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
				193	* @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
				194	* @param[in] user_tensors User tensors
				195	* @param[in] aux_tensors Auxiliary tensors required by the workload code
				196	*
				197	* @return Status
				198	*/
				199	Status create_tensor_lut(ClTensorLUT tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor > &user_tensors, const ClAuxTensors &aux_tensors)
				200	{
				201	// Combine user tensors and aux tensors
				202	std::map<ITensorInfo::Id, CLTensor *> tensor_map;
				203	for(auto tensor : user_tensors)
				204	{
				205	const auto t_id = tensor->info()->id();
				206	if(tensor_map.find(t_id) != tensor_map.end())
				207	{
				208	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
				209	}
				210	tensor_map[t_id] = tensor;
				211	}
				212	for(const auto &data : aux_tensors.get_tensors())
				213	{
				214	const auto t_id = data.tensor_info.id();
				215	const auto tensor = data.tensor;
				216	if(tensor_map.find(t_id) != tensor_map.end())
				217	{
				218	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
				219	}
				220	tensor_map[t_id] = tensor;
				221	}
				222
				223	// Add tensor objects into corresponding tensor packs
				224	for(auto id_tensor : tensor_map)
				225	{
				226	const auto t_id = id_tensor.first;
				227	const auto tensor_object = id_tensor.second;
				228	if(tensor_object == nullptr)
				229	{
				230	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
				231	}
				232	if(tensor_object->allocator()->info().total_size() == 0U)
				233	{
				234	return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
				235	}
				236
				237	for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
				238	{
				239	ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
				240	if(tensor_pack == nullptr)
				241	{
				242	tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
				243	}
				244	else
				245	{
				246	tensor_pack->add_tensor(t_id, tensor_object);
				247	}
				248	}
				249	}
				250	return Status{};
				251	}
				252
				253	} // namespace
				254
				255	struct ClWorkloadRuntime::Implementation
				256	{
				257	std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
				258	std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
				259	bool _is_configured{ false };
				260	bool _is_prepared{ false };
				261	ClTensorLUT _tensor_lut{};
				262	ClAuxTensors _aux_tensors{};
				263	GpuWorkloadSourceCode _source_code{};
				264	};
				265
				266	ClWorkloadRuntime::ClWorkloadRuntime()
				267	: _impl{ std::make_unique<Implementation>() }
				268	{
				269	}
				270
				271	ClWorkloadRuntime::~ClWorkloadRuntime() = default;
				272
				273	Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
				274	{
				275	ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
				276	ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
				277	// Generate source code
				278	_impl->_source_code = sketch.implementation().generate_source_code();
				279	// Configure unit workload from source code
				280	for(auto uwk_id : _impl->_source_code.unit_workloads())
				281	{
				282	const auto work = _impl->_source_code.query_unit_workload(uwk_id);
				283	const auto stage = work.stage().stage;
				284	auto k = std::make_unique<ClKernelRuntime>();
				285	k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
				286
				287	switch(stage)
				288	{
				289	case UnitWorkloadStage::Stage::Run:
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame^]	290	{
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	291	_impl->_kernels.emplace(work.id(), std::move(k));
				292	break;
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame^]	293	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	294	case UnitWorkloadStage::Stage::Prepare:
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame^]	295	{
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	296	_impl->_kernels_prep.emplace(work.id(), std::move(k));
				297	break;
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame^]	298	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	299	default:
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame^]	300	{
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	301	ARM_COMPUTE_ERROR("Invalid unit workload stage");
SiCong Li	a2b131b	2022-11-04 10:11:32 +0000	[diff] [blame^]	302	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	303	}
SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame]	304	}
				305	// Create auxiliary tensor objects
				306	create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
				307	_impl->_is_configured = true;
				308	return Status{};
				309	}
				310
				311	void ClWorkloadRuntime::prepare()
				312	{
				313	if(!_impl->_is_prepared)
				314	{
				315	for(auto &id_kernel_pair : _impl->_kernels_prep)
				316	{
				317	const bool flush_queue = false;
				318	const auto uwk_id = id_kernel_pair.first;
				319	auto kernel = id_kernel_pair.second.get();
				320	CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
				321	}
				322
				323	_impl->_is_prepared = true;
				324	}
				325	}
				326
				327	Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
				328	{
				329	// Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
				330	// in which case the lut can be cached during prepare
				331	const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
				332	ARM_COMPUTE_RETURN_ON_ERROR(st);
				333	prepare();
				334	for(auto &id_kernel_pair : _impl->_kernels)
				335	{
				336	// Flush the command queue on the last kernel
				337	const bool flush_queue = false;
				338	const auto uwk_id = id_kernel_pair.first;
				339	auto kernel = id_kernel_pair.second.get();
				340	CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
				341	}
				342	return Status{};
				343	}
				344
				345	std::vector<std::pair<CLTensor *, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
				346	{
				347	std::vector<std::pair<CLTensor *, AuxMemoryInfo>> aux_tensors;
				348	for(const auto &data : _impl->_aux_tensors.get_tensors())
				349	{
				350	aux_tensors.emplace_back(data.tensor, data.memory_info);
				351	}
				352	return aux_tensors;
				353	}
				354	} // namespace dynamic_fusion
				355	} // namespace experimental
				356	} // namespace arm_compute