// Source: src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp - ml/ComputeLibrary - Gitiles

 /*
  * Copyright (c) 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"

 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
 #include "support/Cast.h"

 #include <algorithm>

 namespace arm_compute
 {
 namespace experimental
 {
 namespace dynamic_fusion
 {
 namespace
 {
 /** Owner of the auxiliary @ref CLTensor objects required by a @ref GpuWorkloadSourceCode.
  *
  * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
  *
  * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it, is out of scope
  */
 class ClAuxTensors
 {
 public:
     /** A non-owning view of a single auxiliary tensor and the associated @ref TensorInfo and @ref AuxMemoryInfo
      */
     struct DataView
     {
         DataView() = default;
         DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
             : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
         {
         }
         ~DataView()                     = default;
         DataView(const DataView &other) = default;
         DataView &operator=(const DataView &other) = default;
         DataView(DataView &&other)                 = default;
         DataView &operator=(DataView &&other) = default;
         CLTensor     *tensor{};      /**< Pointer to the auxiliary tensor (owned by the enclosing holder) */
         TensorInfo    tensor_info{}; /**< Associated tensor info */
         AuxMemoryInfo memory_info{}; /**< Memory requirement */
     };

     /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors.
      *
      * @return A copy of the list of views, in the order the tensors were added
      */
     std::vector<DataView> get_tensors()
     {
         return _tensors;
     }
     /** Const overload of @ref get_tensors; also returns a copy of the list of views. */
     std::vector<DataView> get_tensors() const
     {
         return _tensors;
     }

     friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);

 private:
     /** Add an auxiliary tensor, or retrieve the one already registered under the same id.
      *
      * @param[in] tensor_info     @ref ITensorInfo of the auxiliary tensor; its id is the lookup key
      * @param[in] aux_memory_info Memory requirements of the auxiliary tensor
      *
      * @return CLTensor*  The tensor object owned by this holder for the id of @p tensor_info
      */
     CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
     {
         const auto t_id             = tensor_info.id();
         auto       find_tensor_pair = _owned_tensors.find(t_id);
         if(find_tensor_pair != _owned_tensors.end())
         {
             // Already registered: hand back the existing tensor instead of creating a duplicate.
             // (The iterator may only be dereferenced when it is not end().)
             return find_tensor_pair->second.get();
         }

         // First time this id is seen: create the tensor object and record a view of it
         auto tensor        = std::make_unique<CLTensor>();
         auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
         auto new_tensor    = inserted_pair->second.get();
         _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
         return new_tensor;
     }

     std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{}; /**< Owning storage, keyed by tensor id */
     std::vector<DataView> _tensors{};                                      /**< Views handed out by @ref get_tensors */
 };
 /** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
  *
  * Walks every tensor of @p code and registers a @ref CLTensor object in @p aux_tensors for each one whose
  * memory type is @ref MemoryType::Auxiliary. Tensors of any other memory type are skipped.
  *
  * @note This is the only recommended method for user to create @ref ClAuxTensors
  *
  * @param[out] aux_tensors Auxiliary tensors required by the workload code
  * @param[in]  code        @ref GpuWorkloadSourceCode which all tensors bind to
  *
  * @return Status  An error if an auxiliary tensor object could not be obtained
  */
 Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
 {
     for(auto t_id : code.tensors())
     {
         // Get tensor object
         const auto workload_arg = code.query_tensor(t_id);
         if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
         {
             // Create aux tensor CLTensor object
             const TensorInfo tensor_info = *workload_arg->tensor_info();
             ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
             const auto       aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
             const ICLTensor *tensor_object   = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);

             // Only auxiliary tensors are expected to yield an object; checking outside this branch
             // would report an error for every non-auxiliary tensor and abort the loop early
             if(tensor_object == nullptr)
             {
                 return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
             }
         }
     }
     return Status{};
 }

 /** A fast tensor lookup table for runtime tensor objects retrieval
  */
 class ClTensorLUT
 {
 public:
     /** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
      *
      * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
      *
      * @return ITensorPack*
      */
     ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
     {
         auto tensor_pack = _tensor_packs.find(uwk_id);
         if(tensor_pack != _tensor_packs.end())
         {
             return &(tensor_pack->second);
         }
         return nullptr;
     }
     /** Get a tensor pack associated with @p uwk_id. Throws a exception if it cannot be found.
      *
      * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
      *
      * @return ITensorPack*
      */
     ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
     {
         return _tensor_packs.at(uwk_id);
     }

     friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors);

 private:
     /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
      *
      * @param[in] uwk_id      @ref UnitWorkloadId associated with the tensor pack
      * @param[in] tensor_pack Tensor pack to be added
      */
     void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
     {
         _tensor_packs[uwk_id] = tensor_pack;
     }
     std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
 };

 /** Create a fast tensor lookup table for runtime tensor retrieval
  *
  * User tensors and auxiliary tensors are first merged into a single id-to-tensor map (ids must be unique
  * across both sets), then every tensor is added to the tensor pack of each unit workload that uses it.
  *
  * @param[out] tensor_lut   @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
  * @param[in]  code         @ref GpuWorkloadSourceCode which all tensors bind to
  * @param[in]  user_tensors User tensors. Must not contain nullptr
  * @param[in]  aux_tensors  Auxiliary tensors required by the workload code
  *
  * @return Status  An error on a null tensor, on clashing tensor ids, or on a tensor whose info reports a total size of 0
  */
 Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors)
 {
     // Combine user tensors and aux tensors
     std::map<ITensorInfo::Id, CLTensor *> tensor_map;
     for(auto tensor : user_tensors)
     {
         // Reject null user tensors before they are dereferenced to read the tensor id
         if(tensor == nullptr)
         {
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
         }
         const auto t_id = tensor->info()->id();
         if(tensor_map.find(t_id) != tensor_map.end())
         {
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
         }
         tensor_map[t_id] = tensor;
     }
     for(const auto &data : aux_tensors.get_tensors())
     {
         const auto t_id   = data.tensor_info.id();
         const auto tensor = data.tensor;
         if(tensor_map.find(t_id) != tensor_map.end())
         {
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
         }
         tensor_map[t_id] = tensor;
     }

     // Add tensor objects into corresponding tensor packs
     for(const auto &id_tensor : tensor_map)
     {
         const auto t_id          = id_tensor.first;
         const auto tensor_object = id_tensor.second;
         // Still needed for auxiliary tensor views, which are not checked while merging
         if(tensor_object == nullptr)
         {
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
         }
         if(tensor_object->allocator()->info().total_size() == 0U)
         {
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
         }

         for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
         {
             ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
             if(tensor_pack == nullptr)
             {
                 // First tensor seen for this unit workload: start a new pack
                 tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
             }
             else
             {
                 tensor_pack->add_tensor(t_id, tensor_object);
             }
         }
     }
     return Status{};
 }

 } // namespace

 /** Private state of @ref ClWorkloadRuntime, held through its _impl member */
 struct ClWorkloadRuntime::Implementation
 {
     // Kernels of unit workloads in the Run stage; filled by configure() and enqueued by run()
     std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
     // Kernels of unit workloads in the Prepare stage; filled by configure() and enqueued by prepare()
     std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
     // Set at the end of configure(); configure() refuses to run again once it is true
     bool                  _is_configured{ false };
     // Set after the prepare kernels have been enqueued so that prepare() only does its work once
     bool                  _is_prepared{ false };
     // Tensor packs per unit workload; populated by create_tensor_lut() on every run()
     ClTensorLUT           _tensor_lut{};
     // Auxiliary tensor objects created by configure()
     ClAuxTensors          _aux_tensors{};
     // Source code generated from the sketch passed to configure()
     GpuWorkloadSourceCode _source_code{};
 };

 /** Default constructor: creates the (not yet configured) implementation state */
 ClWorkloadRuntime::ClWorkloadRuntime()
     : _impl(std::make_unique<Implementation>())
 {
 }

 // Defaulted in this translation unit, where Implementation is a complete type.
 // NOTE(review): presumably _impl is a std::unique_ptr<Implementation>, which needs the complete type
 // to destroy it - confirm against the header.
 ClWorkloadRuntime::~ClWorkloadRuntime() = default;

 /** Configure the runtime from a workload sketch.
  *
  * Generates the workload source code from @p sketch, configures one @ref ClKernelRuntime per unit workload
  * (stored separately for the Run and Prepare stages) and creates the auxiliary tensor objects.
  *
  * @param[in] sketch Workload sketch to configure from; its gpu language has to be OpenCL
  *
  * @return Status  An error if the runtime is already configured or the sketch is not an OpenCL sketch
  */
 Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");

     // Turn the sketch into source code; everything below is driven by it
     _impl->_source_code = sketch.implementation().generate_source_code();

     // One kernel runtime per unit workload, filed under the stage it belongs to
     for(auto workload_id : _impl->_source_code.unit_workloads())
     {
         const auto unit_workload  = _impl->_source_code.query_unit_workload(workload_id);
         const auto workload_stage = unit_workload.stage().stage;
         auto       kernel_runtime = std::make_unique<ClKernelRuntime>();
         kernel_runtime->configure(*sketch.gpu_context()->cl_compile_context(), unit_workload.code());

         if(workload_stage == UnitWorkloadStage::Stage::Run)
         {
             _impl->_kernels.emplace(unit_workload.id(), std::move(kernel_runtime));
         }
         else if(workload_stage == UnitWorkloadStage::Stage::Prepare)
         {
             _impl->_kernels_prep.emplace(unit_workload.id(), std::move(kernel_runtime));
         }
         else
         {
             ARM_COMPUTE_ERROR("Invalid unit workload stage");
         }
     }

     // Create auxiliary tensor objects
     // NOTE(review): the Status returned by create_aux_tensors is discarded here
     create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
     _impl->_is_configured = true;
     return Status{};
 }

 void ClWorkloadRuntime::prepare()
 {
     if(!_impl->_is_prepared)
     {
         for(auto &id_kernel_pair : _impl->_kernels_prep)
         {
             const bool flush_queue = false;
             const auto uwk_id      = id_kernel_pair.first;
             auto       kernel      = id_kernel_pair.second.get();
             CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
         }

         _impl->_is_prepared = true;
     }
 }

 /** Bind the given tensors and enqueue all run-stage kernels.
  *
  * The tensor lookup table is rebuilt from @p tensors and the auxiliary tensors on every call, then
  * prepare() is invoked (it returns immediately after its first completion) before the kernels are enqueued.
  *
  * @param[in] tensors User tensors to bind to the workload
  *
  * @return Status  The error reported while building the tensor lookup table, if any
  */
 Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
 {
     // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
     // in which case the lut can be cached during prepare
     const Status lut_status = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
     ARM_COMPUTE_RETURN_ON_ERROR(lut_status);

     prepare();

     for(auto &run_entry : _impl->_kernels)
     {
         const auto       workload_id = run_entry.first;
         ClKernelRuntime *run_kernel  = run_entry.second.get();
         // NOTE(review): flush is false for every kernel, including the last one; confirm whether the
         // final enqueue was meant to flush the command queue
         const bool flush_queue = false;
         CLScheduler::get().enqueue_op(*run_kernel, _impl->_tensor_lut.get_tensor_pack(workload_id), flush_queue);
     }
     return Status{};
 }

 std::vector<std::pair<CLTensor *, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
 {
     std::vector<std::pair<CLTensor *, AuxMemoryInfo>> aux_tensors;
     for(const auto &data : _impl->_aux_tensors.get_tensors())
     {
         aux_tensors.emplace_back(data.tensor, data.memory_info);
     }
     return aux_tensors;
 }
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
	/*
	* Copyright (c) 2022 Arm Limited.
	*
	* SPDX-License-Identifier: MIT
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to
	* deal in the Software without restriction, including without limitation the
	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
	* sell copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in all
	* copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*/
	#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"

	#include "arm_compute/core/experimental/Types.h"
	#include "arm_compute/runtime/CL/CLTensor.h"
	#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
	#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
	#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
	#include "support/Cast.h"

	#include <algorithm>

	namespace arm_compute
	{
	namespace experimental
	{
	namespace dynamic_fusion
	{
	namespace
	{
	/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
	*
	* @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
	*
	* @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it, is out of scope
	*/
	class ClAuxTensors
	{
	public:
	/** A view of a single auxiliary data and the associated @ref TensorInfo and @ref AuxMemoryInfo
	*/
	struct DataView
	{
	DataView() = default;
	DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
	: tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
	{
	}
	~DataView() = default;
	DataView(const DataView &other) = default;
	DataView &operator=(const DataView &other) = default;
	DataView(DataView &&other) = default;
	DataView &operator=(DataView &&other) = default;
	CLTensor tensor{}; /< Pointer to the auxiliary tensor /
	TensorInfo tensor_info{}; /*< Associated tensor info /
	AuxMemoryInfo memory_info{}; /*< Memory requirement /
	};

	/** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
	std::vector<DataView> get_tensors()
	{
	return _tensors;
	}
	std::vector<DataView> get_tensors() const
	{
	return _tensors;
	}

	friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);

	private:
	/** Add auxiliary tensor.
	*
	* @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
	* @param[in] memory_info Memory requirements of the auxiliary tensor
	*
	* @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
	*/
	CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
	{
	const auto t_id = tensor_info.id();
	auto find_tensor_pair = _owned_tensors.find(t_id);
	if(find_tensor_pair == _owned_tensors.end())
	{
	return find_tensor_pair->second.get();
	}
	else
	{
	auto tensor = std::make_unique<CLTensor>();
	auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
	auto new_tensor = inserted_pair->second.get();
	_tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
	return new_tensor;
	}
	}

	std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
	std::vector<DataView> _tensors{};
	};
	/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
	 *
	 * Walks every tensor of @p code and registers a @ref CLTensor object in @p aux_tensors for each one whose
	 * memory type is @ref MemoryType::Auxiliary. Tensors of any other memory type are skipped.
	 *
	 * @note This is the only recommended method for user to create @ref ClAuxTensors
	 *
	 * @param[out] aux_tensors Auxiliary tensors required by the workload code
	 * @param[in]  code        @ref GpuWorkloadSourceCode which all tensors bind to
	 *
	 * @return Status  An error if an auxiliary tensor object could not be obtained
	 */
	Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
	{
	    for(auto t_id : code.tensors())
	    {
	        // Get tensor object
	        const auto workload_arg = code.query_tensor(t_id);
	        if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
	        {
	            // Create aux tensor CLTensor object
	            const TensorInfo tensor_info = *workload_arg->tensor_info();
	            ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
	            const auto       aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
	            const ICLTensor *tensor_object   = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);

	            // Only auxiliary tensors are expected to yield an object; checking outside this branch
	            // would report an error for every non-auxiliary tensor and abort the loop early
	            if(tensor_object == nullptr)
	            {
	                return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
	            }
	        }
	    }
	    return Status{};
	}

	/** A fast tensor lookup table for runtime tensor objects retrieval
	*/
	class ClTensorLUT
	{
	public:
	/** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
	*
	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
	*
	* @return ITensorPack*
	*/
	ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
	{
	auto tensor_pack = _tensor_packs.find(uwk_id);
	if(tensor_pack != _tensor_packs.end())
	{
	return &(tensor_pack->second);
	}
	return nullptr;
	}
	/** Get a tensor pack associated with @p uwk_id. Throws a exception if it cannot be found.
	*
	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
	*
	* @return ITensorPack*
	*/
	ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
	{
	return _tensor_packs.at(uwk_id);
	}

	friend Status create_tensor_lut(ClTensorLUT tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor > &user_tensors, const ClAuxTensors &aux_tensors);

	private:
	/** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
	*
	* @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
	* @param[in] tensor_pack Tensor pack to be added
	*/
	void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
	{
	_tensor_packs[uwk_id] = tensor_pack;
	}
	std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
	};

	/** Create a fast tensor lookup table for runtime tensor retrieval
	 *
	 * User tensors and auxiliary tensors are first merged into a single id-to-tensor map (ids must be unique
	 * across both sets), then every tensor is added to the tensor pack of each unit workload that uses it.
	 *
	 * @param[out] tensor_lut   @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
	 * @param[in]  code         @ref GpuWorkloadSourceCode which all tensors bind to
	 * @param[in]  user_tensors User tensors. Must not contain nullptr
	 * @param[in]  aux_tensors  Auxiliary tensors required by the workload code
	 *
	 * @return Status  An error on a null tensor, on clashing tensor ids, or on a tensor whose info reports a total size of 0
	 */
	Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors)
	{
	    // Combine user tensors and aux tensors
	    std::map<ITensorInfo::Id, CLTensor *> tensor_map;
	    for(auto tensor : user_tensors)
	    {
	        // Reject null user tensors before they are dereferenced to read the tensor id
	        if(tensor == nullptr)
	        {
	            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
	        }
	        const auto t_id = tensor->info()->id();
	        if(tensor_map.find(t_id) != tensor_map.end())
	        {
	            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
	        }
	        tensor_map[t_id] = tensor;
	    }
	    for(const auto &data : aux_tensors.get_tensors())
	    {
	        const auto t_id   = data.tensor_info.id();
	        const auto tensor = data.tensor;
	        if(tensor_map.find(t_id) != tensor_map.end())
	        {
	            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
	        }
	        tensor_map[t_id] = tensor;
	    }

	    // Add tensor objects into corresponding tensor packs
	    for(const auto &id_tensor : tensor_map)
	    {
	        const auto t_id          = id_tensor.first;
	        const auto tensor_object = id_tensor.second;
	        // Still needed for auxiliary tensor views, which are not checked while merging
	        if(tensor_object == nullptr)
	        {
	            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
	        }
	        if(tensor_object->allocator()->info().total_size() == 0U)
	        {
	            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
	        }

	        for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
	        {
	            ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
	            if(tensor_pack == nullptr)
	            {
	                // First tensor seen for this unit workload: start a new pack
	                tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
	            }
	            else
	            {
	                tensor_pack->add_tensor(t_id, tensor_object);
	            }
	        }
	    }
	    return Status{};
	}

	} // namespace

	/** Private state of @ref ClWorkloadRuntime, held through its _impl member */
	struct ClWorkloadRuntime::Implementation
	{
	// Kernels of unit workloads in the Run stage; filled by configure() and enqueued by run()
	std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
	// Kernels of unit workloads in the Prepare stage; filled by configure() and enqueued by prepare()
	std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
	// Set at the end of configure(); configure() refuses to run again once it is true
	bool _is_configured{ false };
	// Set after the prepare kernels have been enqueued so that prepare() only does its work once
	bool _is_prepared{ false };
	// Tensor packs per unit workload; populated by create_tensor_lut() on every run()
	ClTensorLUT _tensor_lut{};
	// Auxiliary tensor objects created by configure()
	ClAuxTensors _aux_tensors{};
	// Source code generated from the sketch passed to configure()
	GpuWorkloadSourceCode _source_code{};
	};

	/** Default constructor: creates the (not yet configured) implementation state */
	ClWorkloadRuntime::ClWorkloadRuntime()
	    : _impl(std::make_unique<Implementation>())
	{
	}

	// Defaulted in this translation unit, where Implementation is a complete type.
	// NOTE(review): presumably _impl is a std::unique_ptr<Implementation>, which needs the complete type
	// to destroy it - confirm against the header.
	ClWorkloadRuntime::~ClWorkloadRuntime() = default;

	/** Configure the runtime from a workload sketch.
	 *
	 * Generates the workload source code from @p sketch, configures one @ref ClKernelRuntime per unit workload
	 * (stored separately for the Run and Prepare stages) and creates the auxiliary tensor objects.
	 *
	 * @param[in] sketch Workload sketch to configure from; its gpu language has to be OpenCL
	 *
	 * @return Status  An error if the runtime is already configured or the sketch is not an OpenCL sketch
	 */
	Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
	{
	    ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
	    ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");

	    // Turn the sketch into source code; everything below is driven by it
	    _impl->_source_code = sketch.implementation().generate_source_code();

	    // One kernel runtime per unit workload, filed under the stage it belongs to
	    for(auto workload_id : _impl->_source_code.unit_workloads())
	    {
	        const auto unit_workload  = _impl->_source_code.query_unit_workload(workload_id);
	        const auto workload_stage = unit_workload.stage().stage;
	        auto       kernel_runtime = std::make_unique<ClKernelRuntime>();
	        kernel_runtime->configure(*sketch.gpu_context()->cl_compile_context(), unit_workload.code());

	        if(workload_stage == UnitWorkloadStage::Stage::Run)
	        {
	            _impl->_kernels.emplace(unit_workload.id(), std::move(kernel_runtime));
	        }
	        else if(workload_stage == UnitWorkloadStage::Stage::Prepare)
	        {
	            _impl->_kernels_prep.emplace(unit_workload.id(), std::move(kernel_runtime));
	        }
	        else
	        {
	            ARM_COMPUTE_ERROR("Invalid unit workload stage");
	        }
	    }

	    // Create auxiliary tensor objects
	    // NOTE(review): the Status returned by create_aux_tensors is discarded here
	    create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
	    _impl->_is_configured = true;
	    return Status{};
	}

	void ClWorkloadRuntime::prepare()
	{
	if(!_impl->_is_prepared)
	{
	for(auto &id_kernel_pair : _impl->_kernels_prep)
	{
	const bool flush_queue = false;
	const auto uwk_id = id_kernel_pair.first;
	auto kernel = id_kernel_pair.second.get();
	CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
	}

	_impl->_is_prepared = true;
	}
	}

	/** Bind the given tensors and enqueue all run-stage kernels.
	 *
	 * The tensor lookup table is rebuilt from @p tensors and the auxiliary tensors on every call, then
	 * prepare() is invoked (it returns immediately after its first completion) before the kernels are enqueued.
	 *
	 * @param[in] tensors User tensors to bind to the workload
	 *
	 * @return Status  The error reported while building the tensor lookup table, if any
	 */
	Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
	{
	    // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
	    // in which case the lut can be cached during prepare
	    const Status lut_status = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
	    ARM_COMPUTE_RETURN_ON_ERROR(lut_status);

	    prepare();

	    for(auto &run_entry : _impl->_kernels)
	    {
	        const auto       workload_id = run_entry.first;
	        ClKernelRuntime *run_kernel  = run_entry.second.get();
	        // NOTE(review): flush is false for every kernel, including the last one; confirm whether the
	        // final enqueue was meant to flush the command queue
	        const bool flush_queue = false;
	        CLScheduler::get().enqueue_op(*run_kernel, _impl->_tensor_lut.get_tensor_pack(workload_id), flush_queue);
	    }
	    return Status{};
	}

	std::vector<std::pair<CLTensor *, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
	{
	std::vector<std::pair<CLTensor *, AuxMemoryInfo>> aux_tensors;
	for(const auto &data : _impl->_aux_tensors.get_tensors())
	{
	aux_tensors.emplace_back(data.tensor, data.memory_info);
	}
	return aux_tensors;
	}
	} // namespace dynamic_fusion
	} // namespace experimental
	} // namespace arm_compute