// arm_compute/core/experimental/ClWorkload.h - ml/ComputeLibrary - Gitiles

 /*
  * Copyright (c) 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H

 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Window.h"

 #include "arm_compute/core/experimental/IWorkload.h"
 #include "arm_compute/core/experimental/OperatorGraph.h"

 #include <map>
 #include <tuple>

 namespace arm_compute
 {
 namespace experimental
 {
 namespace dynamic_fusion
 {
 /** Explicit enumeration of every tensor-argument variant used by the kernel implementations.
  *
  *  Listing each variant by name avoids any ambiguity in what kernel arguments are passed.
  *  The numeric values are spelled out; they match the implicit numbering of the declaration order.
  */
 enum class ClKernelTensorArgType : int
 {
     Scalar = 0,

     Vector = 1,

     Image                     = 2,
     Image_Reinterpret_As_3D   = 3,
     Image_Export_To_ClImage2D = 4,

     Image_3D                     = 5, /**< 3D Tensor represented as a 2D Image + stride_z */
     Image_3D_Export_To_ClImage2D = 6,

     Tensor_3D          = 7,
     Tensor_4D          = 8,
     Tensor_4D_t_Buffer = 9,
     Tensor_4D_t_Image  = 10
 };

 /** Holds the information needed to add one kernel argument at run time.
  *
  *  @note This struct can later be expanded into a more concise and formal way to specify how to set up
  *  arguments for a kernel inside a @ref ClUnitWorkload
  */
 struct ClKernelArgDescriptor
 {
     /** Arg ID in the blueprint, -1 means empty / uninitialized */
     int arg_id{ -1 };
     /** Tensor argument type */
     ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image };
     /** @note slide_along_dimz will be moved out of this descriptor in later iterations */
     bool slide_along_dimz{ true };

     ClKernelArgDescriptor() = default;
     /** Construct a descriptor from its three fields.
      *
      * @param[in] arg_id           Value stored in @ref arg_id
      * @param[in] type             Value stored in @ref tensor_arg_type
      * @param[in] slide_along_dimz Value stored in @ref slide_along_dimz (defaults to true)
      */
     ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
         : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
     {
     }
     ~ClKernelArgDescriptor() = default;

     /** Equality over tensor_arg_type and slide_along_dimz only; arg_id does not take part in the comparison. */
     friend bool operator==(const ClKernelArgDescriptor &lhs, const ClKernelArgDescriptor &rhs)
     {
         if(lhs.tensor_arg_type != rhs.tensor_arg_type)
         {
             return false;
         }
         return lhs.slide_along_dimz == rhs.slide_along_dimz;
     }
 };

 /** Collection of kernel argument descriptors held in a std::map with an int key. */
 using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;

 /** Execution parameters required to run a single ClWorkload.
  */
 struct ClExecutionDescriptor
 {
     /** Suggested local work-group size for optimal performance if not zero */
     cl::NDRange suggested_lws{};
     /** Global work-group to be used */
     cl::NDRange gws{};
     /** Skip sliding window slices during execution loop */
     bool skip_sliding_window = false;
 };

 /** Kernel code, together with its build and launch description, to be compiled and run in a ClUnitWorkload.
  */
 struct ClKernelCode
 {
     /** Kernel name */
     std::string name{};
     /** Kernel source code */
     std::string code{};
     /** Generated from blueprint based on complex component */
     std::string config_id{};
     /** Kernel build options */
     CLBuildOptions build_options{};
     /** Execution window */
     Window window{};
     /** Kernel argument descriptors. map key is kernel ArgumentID */
     ClKernelArgList arguments{};

     /** Member-wise equality; fields are checked in declaration order and the first mismatch ends the comparison. */
     friend bool operator==(const ClKernelCode &lhs, const ClKernelCode &rhs)
     {
         if(!(lhs.name == rhs.name) || !(lhs.code == rhs.code) || !(lhs.config_id == rhs.config_id))
         {
             return false;
         }
         if(!(lhs.build_options == rhs.build_options) || !(lhs.window == rhs.window))
         {
             return false;
         }
         return lhs.arguments == rhs.arguments;
     }
 };

 /** Descriptor of a ClWorkload tensor: a @ref WorkloadTensor extended with a kernel argument descriptor.
  */
 struct ClWorkloadTensor : public WorkloadTensor
 {
     /** Kernel argument descriptor attached to this tensor */
     ClKernelArgDescriptor kernel_arg{};

     ClWorkloadTensor() = default;
     /** Construct from the base-class fields plus the kernel argument descriptor.
      *
      * The first four arguments are forwarded unchanged to the @ref WorkloadTensor base.
      */
     ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
         : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
     {
     }

     /** Equality over info, memory_info, memory_type and kernel_arg (checked in that order); the id is not compared.
      *
      * NOTE(review): the constructor takes info as an ITensorInfo pointer; if the base stores it as a raw
      * pointer this compares addresses rather than tensor metadata - confirm against WorkloadTensor.
      */
     friend bool operator==(const ClWorkloadTensor &lhs, const ClWorkloadTensor &rhs)
     {
         if(!(lhs.info == rhs.info))
         {
             return false;
         }
         if(!(lhs.memory_info == rhs.memory_info))
         {
             return false;
         }
         if(!(lhs.memory_type == rhs.memory_type))
         {
             return false;
         }
         return lhs.kernel_arg == rhs.kernel_arg;
     }
 };

 /** Atomic unit of a @ref ClWorkload. It contains exactly one kernel to run.
  */
 struct ClUnitWorkload : public UnitWorkload
 {
     /** Kernel code run by this unit workload */
     ClKernelCode code{};

     ClUnitWorkload() = default;
     /** Construct a unit workload; id and stage are forwarded to the @ref UnitWorkload base, code is copied. */
     ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
         : UnitWorkload{ id, stage }, code{ code }
     {
     }

     /** Equality over stage and code; the id is not compared. */
     friend bool operator==(const ClUnitWorkload &lhs, const ClUnitWorkload &rhs)
     {
         if(!(lhs.stage == rhs.stage))
         {
             return false;
         }
         return lhs.code == rhs.code;
     }
 };

 /** GPU description used by @ref ClWorkloadContext
  */
 struct GpuInfo
 {
     /** GPU target; GPUTarget::UNKNOWN until set */
     GPUTarget target = GPUTarget::UNKNOWN;

     /** Two GpuInfo objects are equal when their targets are equal. */
     friend bool operator==(const GpuInfo &lhs, const GpuInfo &rhs)
     {
         const bool same_target = (lhs.target == rhs.target);
         return same_target;
     }
 };

 /** Context (device capabilities, platform details) a ClWorkload is associated with
  *
  * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers)
  */
 struct ClWorkloadContext
 {
     /** GPU information of this context */
     GpuInfo gpu_info{};

     /** Two contexts are equal when their gpu_info members are equal. */
     friend bool operator==(const ClWorkloadContext &lhs, const ClWorkloadContext &rhs)
     {
         const bool same_gpu = (lhs.gpu_info == rhs.gpu_info);
         return same_gpu;
     }
 };

 /** Workload for Cl backend
  *
  * Holds the unit workloads and workload tensors, keyed by the ids handed out by `graph`.
  * `graph` is not declared in this struct (presumably inherited from @ref IWorkload - confirm).
  */
 struct ClWorkload : public IWorkload
 {
     /** Register a tensor in the graph and record its descriptor if none is stored for it yet.
      *
      * @param[in] info        Tensor info forwarded to the @ref ClWorkloadTensor constructor
      * @param[in] memory_type Memory type forwarded to the @ref ClWorkloadTensor constructor
      * @param[in] memory_info Auxiliary memory info forwarded to the @ref ClWorkloadTensor constructor
      * @param[in] kernel_arg  Kernel argument descriptor stored with the tensor
      * @param[in] merge_point Passed to graph.add_tensor(); its exact semantics are defined by the graph type
      *
      * @return The id returned by graph.add_tensor(). If a tensor with that id is already stored, the
      *         existing entry is left untouched.
      */
     Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
     {
         const Tid id = graph.add_tensor(merge_point);
         // emplace() inserts only when the key is absent, so a single lookup replaces find() + operator[]
         tensors.emplace(id, ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg));
         return id;
     }
     /** Register an operator in the graph and store the matching unit workload.
      *
      * @param[in] stage   Stage of the unit workload
      * @param[in] code    Kernel code of the unit workload
      * @param[in] inputs  Input tensor ids passed to graph.add_operator()
      * @param[in] outputs Output tensor ids passed to graph.add_operator()
      *
      * @return The id taken from the second element of graph.add_operator()'s result. Any unit workload
      *         already stored under that id is overwritten.
      */
     UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
     {
         // NOTE(review): the first element of the returned pair is discarded; if it carries a status,
         // failures from add_operator() are silently ignored here - confirm.
         const auto op      = graph.add_operator(inputs, outputs);
         const auto id      = op.second;
         unit_workloads[id] = ClUnitWorkload(id, stage, code);
         return id;
     }
     /** Equality over graph, context, unit_workloads, tensors and op_tensor_id_lut; status is not compared.
      *
      * std::tie builds tuples of references, so the graph and the maps are compared in place
      * instead of being copied for every comparison.
      */
     friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
     {
         return std::tie(
                    workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
                == std::tie(
                    workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
     }
     ClWorkloadContext context{};                             /**< Workload context*/
     std::map<UnitWorkId, ClUnitWorkload> unit_workloads{};   /**< Unit workloads to run*/
     std::map<Tid, ClWorkloadTensor>      tensors{};          /**< Workload tensors*/
     std::map<Tid, OpTensor::Id>          op_tensor_id_lut{}; /**< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors)*/
     Status status{};                                         /**< For compatibility with the IOperator validate method. Store if the workload is valid or not. */
 };

 /** Build a @ref ClWorkload from an @ref OperatorGraph.
  *
  * Only the declaration is visible in this header; the definition lives elsewhere.
  *
  * @param[out] workload Workload written by the build
  * @param[in]  op_graph Operator graph to build the workload from
  * @param[in]  ctx      Workload context passed to the build
  *
  * @return Status of the build; the failure conditions are defined by the implementation
  */
 Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);

 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute

 #endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
 #endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
	/*
	* Copyright (c) 2022 Arm Limited.
	*
	* SPDX-License-Identifier: MIT
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to
	* deal in the Software without restriction, including without limitation the
	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
	* sell copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in all
	* copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*/
	#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
	#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
	#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H

	#include "arm_compute/core/CL/CLCompileContext.h"
	#include "arm_compute/core/GPUTarget.h"
	#include "arm_compute/core/Window.h"

	#include "arm_compute/core/experimental/IWorkload.h"
	#include "arm_compute/core/experimental/OperatorGraph.h"

	#include <map>

	namespace arm_compute
	{
	namespace experimental
	{
	namespace dynamic_fusion
	{
	/** Explicit enumeration of every tensor-argument variant used by the kernel implementations.
	 *
	 *  Listing each variant by name avoids any ambiguity in what kernel arguments are passed.
	 *  The numeric values are spelled out; they match the implicit numbering of the declaration order.
	 */
	enum class ClKernelTensorArgType : int
	{
	    Scalar = 0,

	    Vector = 1,

	    Image                     = 2,
	    Image_Reinterpret_As_3D   = 3,
	    Image_Export_To_ClImage2D = 4,

	    Image_3D                     = 5, /**< 3D Tensor represented as a 2D Image + stride_z */
	    Image_3D_Export_To_ClImage2D = 6,

	    Tensor_3D          = 7,
	    Tensor_4D          = 8,
	    Tensor_4D_t_Buffer = 9,
	    Tensor_4D_t_Image  = 10
	};

	/** Describes all the info required to add a kernel argument at run time
	*
	* @note This struct can later be expanded into a more concise and formal way to specify how to set up
	* arguments for a kernel inside a @ref ClUnitWorkload
	*/
	struct ClKernelArgDescriptor
	{
	ClKernelArgDescriptor() = default;
	ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
	: arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
	{
	}
	~ClKernelArgDescriptor() = default;
	friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1)
	{
	return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz);
	}
	int arg_id{ -1 }; /*< Arg ID in the blueprint, -1 means empty / uninitialized /
	ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /*< tensor argument type /
	bool slide_along_dimz{ true }; /*< @note slide_along_dimz will be moved out of this descriptor in later iterations /
	};

	/** Collection of kernel argument descriptors held in a std::map with an int key. */
	using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;

	/** Descriptor containing information required to run a single ClWorkload
	 */
	struct ClExecutionDescriptor
	{
	    // Each member comment is properly terminated so it no longer comments out the declarations after it.
	    cl::NDRange suggested_lws{};              /**< Suggested local work-group size for optimal performance if not zero */
	    cl::NDRange gws{};                        /**< Global work-group to be used */
	    bool        skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */
	};

	/** Contains kernel code to be compiled and run in a ClUnitWorkload
	*/
	struct ClKernelCode
	{
	friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1)
	{
	return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window)
	&& (code0.arguments == code1.arguments);
	}
	std::string name{}; /*< Kernel name /
	std::string code{}; /*< Kernel source code /
	std::string config_id{}; /*< Generated from blueprint based on complex component /
	CLBuildOptions build_options{}; /*< Kernel build options /
	Window window{}; /*< Execution window /
	ClKernelArgList arguments{}; /*< Kernel argument descriptors. map key is kernel ArgumentID /
	};

	/** Descriptor of a ClWorkload tensor: a @ref WorkloadTensor extended with a kernel argument descriptor.
	 */
	struct ClWorkloadTensor : public WorkloadTensor
	{
	    /** Kernel argument descriptor attached to this tensor */
	    ClKernelArgDescriptor kernel_arg{};

	    ClWorkloadTensor() = default;
	    /** Construct from the base-class fields plus the kernel argument descriptor.
	     *
	     * The first four arguments are forwarded unchanged to the @ref WorkloadTensor base.
	     */
	    ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
	        : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
	    {
	    }

	    /** Equality over info, memory_info, memory_type and kernel_arg (checked in that order); the id is not compared.
	     *
	     * NOTE(review): the constructor takes info as an ITensorInfo pointer; if the base stores it as a raw
	     * pointer this compares addresses rather than tensor metadata - confirm against WorkloadTensor.
	     */
	    friend bool operator==(const ClWorkloadTensor &lhs, const ClWorkloadTensor &rhs)
	    {
	        if(!(lhs.info == rhs.info))
	        {
	            return false;
	        }
	        if(!(lhs.memory_info == rhs.memory_info))
	        {
	            return false;
	        }
	        if(!(lhs.memory_type == rhs.memory_type))
	        {
	            return false;
	        }
	        return lhs.kernel_arg == rhs.kernel_arg;
	    }
	};

	/** Atomic unit of a @ref ClWorkload. It contains exactly one kernel to run.
	 */
	struct ClUnitWorkload : public UnitWorkload
	{
	    /** Kernel code run by this unit workload */
	    ClKernelCode code{};

	    ClUnitWorkload() = default;
	    /** Construct a unit workload; id and stage are forwarded to the @ref UnitWorkload base, code is copied. */
	    ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
	        : UnitWorkload{ id, stage }, code{ code }
	    {
	    }

	    /** Equality over stage and code; the id is not compared. */
	    friend bool operator==(const ClUnitWorkload &lhs, const ClUnitWorkload &rhs)
	    {
	        if(!(lhs.stage == rhs.stage))
	        {
	            return false;
	        }
	        return lhs.code == rhs.code;
	    }
	};

	/** GPU description used by @ref ClWorkloadContext
	 */
	struct GpuInfo
	{
	    /** GPU target; GPUTarget::UNKNOWN until set */
	    GPUTarget target = GPUTarget::UNKNOWN;

	    /** Two GpuInfo objects are equal when their targets are equal. */
	    friend bool operator==(const GpuInfo &lhs, const GpuInfo &rhs)
	    {
	        const bool same_target = (lhs.target == rhs.target);
	        return same_target;
	    }
	};

	/** Context (device capabilities, platform details) a ClWorkload is associated with
	 *
	 * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers)
	 */
	struct ClWorkloadContext
	{
	    /** GPU information of this context */
	    GpuInfo gpu_info{};

	    /** Two contexts are equal when their gpu_info members are equal. */
	    friend bool operator==(const ClWorkloadContext &lhs, const ClWorkloadContext &rhs)
	    {
	        const bool same_gpu = (lhs.gpu_info == rhs.gpu_info);
	        return same_gpu;
	    }
	};

	/** Workload for Cl backend
	*/
	struct ClWorkload : public IWorkload
	{
	Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
	{
	Tid id = graph.add_tensor(merge_point);
	if(tensors.find(id) == tensors.end())
	{
	tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg);
	}
	return id;
	}
	UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
	{
	auto op = graph.add_operator(inputs, outputs);
	auto id = op.second;
	unit_workloads[id] = ClUnitWorkload(id, stage, code);
	return id;
	}
	friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
	{
	return std::make_tuple(
	workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
	== std::make_tuple(
	workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
	}
	ClWorkloadContext context{}; /*< Workload context/
	std::map<UnitWorkId, ClUnitWorkload> unit_workloads{}; /*< Unit workloads to run/
	std::map<Tid, ClWorkloadTensor> tensors{}; /*< Workload tensors/
	std::map<Tid, OpTensor::Id> op_tensor_id_lut{}; /*< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors)/
	Status status{}; /*< For compatibility with the IOperator validate method. Store if the workload is valid or not. /
	};

	/** Build a @ref ClWorkload from an @ref OperatorGraph.
	*
	* Only the declaration is visible in this header; the definition lives elsewhere.
	*
	* @param[out] workload Workload written by the build
	* @param[in] op_graph Operator graph to build the workload from
	* @param[in] ctx Workload context passed to the build
	* @return Status of the build; the failure conditions are defined by the implementation
	*/
	Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);

	} // namespace dynamic_fusion
	} // namespace experimental
	} // namespace arm_compute

	#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
	#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */