1/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#ifndef CKW_SRC_PROTOTYPE_H
26#define CKW_SRC_PROTOTYPE_H
27
28#include <vector>
29#include <map>
30#include <string>
31#include <cstdint> // int32_t
32#include <iostream> // cout (to be removed)
33#include <cassert> // assert (to be removed)
34#include <unordered_map>
35#include <chrono>
36#include <cmath>
37#include <memory>
38#include <algorithm>
39#include <array>
40#include <stdexcept>
41
42#include "ckw/Types.h"
43#include "ckw/TensorInfo.h"
44#include "ckw/Error.h"
45
46namespace ckw
47{
48namespace prototype {
49
50// Dummy data structure for Size2D
51using Size2D = std::vector<int32_t>;
52
53// Dummy Status
54using Status = void;
55
56enum class ComponentType : int32_t
57{
58 Complex = 0,
59 Simple = 1,
60 Unfusable = 2
61};
62
63enum class GpuCompilationSpeed
64{
65 Fast = 0x00, // fast compilation may increase the latency of the network
66 Slow = 0x01 // slow compilation may decrease the latency of the network
67};
68
69enum class GpuExtensions
70{
71 Fp16,
72 Dot8,
73 Mmul,
74 FastMath
75};
76
77struct TensorInfo
78{
79 TensorShape shape { {0} };
80 DataType data_type { DataType::Unknown };
81 TensorDataLayout data_layout { TensorDataLayout::Nhwc };
82 int32_t id { -1 };
83};
84
85struct ComponentAttribute
86{
87 GpuCompilationSpeed compilation_speed {GpuCompilationSpeed::Fast};
88 bool overwrite_tile { true };
89};
90
91inline std::string data_type_to_cl_type(DataType dt)
92{
93 switch(dt)
94 {
95 case DataType::Fp32:
96 return "float";
97 case DataType::Fp16:
98 return "half";
99 case DataType::Int8:
100 return "char";
101 case DataType::Uint8:
102 return "uchar";
103 case DataType::Uint16:
104 return "ushort";
105 case DataType::Int16:
106 return "short";
107 case DataType::Uint32:
108 return "uint";
109 case DataType::Int32:
110 return "int";
111 case DataType::Bool:
112 return "bool";
113 default:
114 assert(false);
115 return "";
116 }
117}
118
119inline int32_t width_to_cl_vector_size(int32_t width)
120{
121 switch(width)
122 {
123 case 1:
124 return 1;
125 case 2:
126 return 2;
127 case 3:
128 return 3;
129 case 4:
130 return 4;
131 case 5:
132 case 6:
133 case 7:
134 case 8:
135 return 8;
136 case 9:
137 case 10:
138 case 11:
139 case 12:
140 case 13:
141 case 14:
142 case 15:
143 case 16:
144 return 16;
145 default:
146 assert(false);
147 return 0;
148 }
149}
150
151inline std::string get_cl_data_type(DataType dt, int32_t width)
152{
153 std::string data_type;
154 int32_t w = width_to_cl_vector_size(width);
155 data_type += data_type_to_cl_type(dt);
156 if(w != 1)
157 {
158 data_type += std::to_string(w);
159 }
160 return data_type;
161}
162
163inline std::string to_opencl_store(int32_t vector_length)
164{
165 if(vector_length != 1)
166 {
167 return "vstore" + std::to_string(vector_length) + "(";
168 }
169 else
170 {
171 return "*(";
172 }
173}
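
/* Example (editor's illustrative sketch, not part of the original prototype): how the helpers above
 * translate a data type and width into OpenCL C strings. The results follow directly from the switch
 * statements above; widths between 5 and 16 are rounded up to the next supported vector size.
 *
 *   get_cl_data_type(DataType::Fp32, 1);  // -> "float"
 *   get_cl_data_type(DataType::Fp16, 3);  // -> "half3"
 *   get_cl_data_type(DataType::Int32, 5); // -> "int8" (5 is rounded up to 8)
 *   to_opencl_store(4);                   // -> "vstore4("
 *   to_opencl_store(1);                   // -> "*("
 */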
174
175struct TileInfo
176{
177 TileInfo() {}
178 TileInfo(DataType dt) : dt(dt), w(1), h(1) {}
179 TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) {}
180 TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) {}
181 DataType dt{ DataType::Unknown }; // Data type of the tile
182 int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels)
183 int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions)
184};
185
186inline std::ostream& operator << (std::ostream& o, const TileInfo& a)
187{
188 o << a.w << " x " << a.h;
189 return o;
190}
191
192struct DataTypeAsString
193{
194 std::string str { "" };
195 DataType dt { DataType::Unknown };
196 int32_t size { 1 };
197};
198
199struct ValueAsString
200{
201 std::string str { "" };
202 DataTypeAsString type { };
203};
204
205// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
206// A Tile is a collection of variables used to express 2D data.
207class IScalarTile
208{
209public:
210 virtual ~IScalarTile() = default;
211 /** Method to get the scalar variable from a tile
212 * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
213 * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
214 *
215 * @return the scalar variable as a string
216 */
217 virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;
218 /** Method to get the list of underlying variable names used by the tile
219 *
220 * @return the list of variable names
221 */
222 virtual std::vector<ValueAsString> underlying_source_variables() const = 0;
223 /** Method to get the name of the tile.
224 *
225 * @return the name of the tile
226 */
227 std::string name() const
228 {
229 return _basename;
230 }
231 /** Method to get the tile format
232 *
233 * @return the format
234 */
235 TileInfo format() const
236 {
237 return _format;
238 }
239 /** Method to know whether the tile is assignable or not (constant)
240 *
241 * @return true if the tile is assignable
242 */
243 virtual bool is_assignable() const = 0;
244 /** Method to know whether the tile needs to be declared
245 *
246 * @return true if the tile needs to be declared in the code before being used
247 */
248 virtual bool need_declaration() const = 0;
249protected:
250 TileInfo _format { }; // Tile format
251 std::string _basename { "" }; // Tile name
252};
253
254// A tile is a collection of variables used to express 2D data. In the GPU context, the variables are vectors.
255// The vector size is given by the width of the tile, and the number of vectors is given by the height of the tile.
256class IVectorTile : public IScalarTile
257{
258public:
259 virtual ~IVectorTile() = default;
260 /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
261 * The user can query the list of supported width for the vectors through preferred_vector_sizes().
262 *
263 * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
264 *
265 * @return the vector variable as a string
266 */
267 virtual ValueAsString vector(int32_t y) const = 0;
268 /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
269 *
270 * @return the vector variable as a string
271 */
272 virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
273 /** Method to get the preferred vector sizes.
274 *
275 * @return a vector with the preferred vector sizes
276 */
277 //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
278};
279
280class ClTile : public IVectorTile
281{
282public:
283 ClTile(const std::string& name, TileInfo format)
284 {
285 _format = format;
286 _basename = name;
287 }
288
289 ValueAsString scalar(int32_t x, int32_t y) const override
290 {
291 x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
292 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
293
294 ValueAsString t;
295 t.str = build_variable_name(y);
296 t.type.str = get_cl_data_type(_format.dt, 1);
297 t.type.dt = _format.dt;
298 t.type.size = 1;
299
300 // Check required because if the width has only one element, we cannot use .s0
301 if(_format.w != 1)
302 {
303 // Automatic broadcasting
304 t.str += ".s" + std::to_string(x);
305 }
306
307 return t;
308 }
309
310 ValueAsString vector(int32_t y) const override
311 {
312 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
313
314 ValueAsString t;
315 t.str = build_variable_name(y);
316 t.type.str = get_cl_data_type(_format.dt, _format.w);
317 t.type.dt = _format.dt;
318 t.type.size = _format.w;
319 return t;
320 }
321
322 ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
323 {
324 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
325
326 ValueAsString t;
327 t.str = build_variable_name(y);
328 t.type.str = get_cl_data_type(_format.dt, width);
329 t.type.dt = _format.dt;
330 t.type.size = width;
331
332 if(_format.w != 1)
333 {
334 t.str += ".s";
335 for(int i = 0; i < width; ++i)
336 {
337 t.str += to_scalar_hex(x_start + i);
338 }
339 }
340 return t;
341 }
342
343 std::vector<ValueAsString> underlying_source_variables() const override
344 {
345 std::vector<ValueAsString> vars;
346 for(int32_t y = 0; y < _format.h; ++y)
347 {
348 ValueAsString t;
349 t.str = build_variable_name(y);
350 t.type.str = get_cl_data_type(_format.dt, _format.w);
351 t.type.dt = _format.dt;
352 t.type.size = _format.w;
353 vars.push_back(t);
354 }
355 return vars;
356 }
357
358 bool is_assignable() const override
359 {
360 return true;
361 }
362
363 bool need_declaration() const override
364 {
365 return true;
366 }
367
368private:
369 std::string build_variable_name(int32_t y) const
370 {
371 std::string var_name = _basename;
372
373 if(_format.h == 1)
374 {
375 return var_name;
376
377 }
378 else
379 {
380 var_name += "_";
381 var_name += std::to_string(y);
382 }
383
384 return var_name;
385 }
386
387 std::string to_scalar_hex(int32_t x) const
388 {
389 switch(x)
390 {
391 case 0:
392 case 1:
393 case 2:
394 case 3:
395 case 4:
396 case 5:
397 case 6:
398 case 7:
399 case 8:
400 case 9:
401 return std::to_string(x);
402 case 10:
403 return "A";
404 case 11:
405 return "B";
406 case 12:
407 return "C";
408 case 13:
409 return "D";
410 case 14:
411 return "E";
412 case 15:
413 return "F";
414 default:
415 std::cout << "Unsupported hexadecimal value" << std::endl;
416 assert(false);
417 return "";
418 }
419 }
420};
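
/* Example (editor's illustrative sketch, not part of the original prototype): the strings produced by a
 * 4 x 2 fp32 ClTile named "acc". Each row of the tile maps to one OpenCL vector variable.
 *
 *   ClTile acc("acc", TileInfo(DataType::Fp32, 4, 2));
 *   acc.scalar(1, 0).str;              // -> "acc_0.s1"  (type "float")
 *   acc.vector(1).str;                 // -> "acc_1"     (type "float4")
 *   acc.vector(0, 2, 1).str;           // -> "acc_1.s01" (type "float2")
 *   acc.underlying_source_variables(); // -> { "acc_0", "acc_1" }, both of type "float4"
 */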
421
422// Unique feature: it contains values in the form of strings. The name of this object is misleading, since the values can change over time.
423class ClConstantTile : public IVectorTile
424{
425public:
426 ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt)
427 {
428 _format.w = in[0].size();
429 _format.h = in.size();
430 _format.dt = dt;
431
432 _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));
433
434 for(int32_t y = 0; y < _format.h; ++y)
435 {
436 for(int32_t x = 0; x < _format.w; ++x)
437 {
438 _data[y][x] = in[y][x];
439 }
440 }
441 }
442
443 ValueAsString scalar(int32_t x, int32_t y) const override
444 {
445 x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
446 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
447
448 ValueAsString t;
449 t.str = _data[y][x];
450 t.type.str = get_cl_data_type(_format.dt, 1);
451 t.type.dt = _format.dt;
452 t.type.size = 1;
453
454 return t;
455 }
456
457 ValueAsString vector(int32_t y) const override
458 {
459 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
460
461 return vector(0, _format.w, y);
462 }
463
464 ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
465 {
466 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
467
468 ValueAsString t;
469 t.str = "";
470 t.type.str = get_cl_data_type(_format.dt, width);
471 t.type.dt = _format.dt;
472 t.type.size = width;
473
474 if(width > 1)
475 {
476 t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
477 }
478
479 int32_t x = x_start;
480 for(; x < x_start + width - 1; ++x)
481 {
482 t.str += scalar(x, y).str;
483 t.str += ", ";
484 }
485 t.str += scalar(x, y).str;
486
487 if(width > 1)
488 {
489 t.str += "))";
490 }
491
492 return t;
493 }
494
495 std::vector<ValueAsString> underlying_source_variables() const override
496 {
497 std::vector<ValueAsString> vars;
498
499 for(int32_t y = 0; y < _format.h; ++y)
500 {
501 for(int32_t x = 0; x < _format.w; ++x)
502 {
503 ValueAsString t;
504 t.str = _data[y][x];
505 t.type.str = get_cl_data_type(_format.dt, 1);
506 t.type.dt = _format.dt;
507 t.type.size = 1;
508 vars.push_back(t);
509 }
510 }
511
512 return vars;
513 }
514
515 bool is_assignable() const override
516 {
517 return false;
518 }
519
520 bool need_declaration() const override
521 {
522 return false;
523 }
524
525private:
526 std::vector<std::vector<std::string>> _data{};
527};
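
/* Example (editor's illustrative sketch, not part of the original prototype): a 2 x 2 fp32 constant tile.
 * The tile does not declare any variable; it expands directly into literal values.
 *
 *   ClConstantTile c({{"1.0f", "2.0f"}, {"3.0f", "4.0f"}}, DataType::Fp32);
 *   c.scalar(0, 1).str;   // -> "3.0f"
 *   c.vector(0).str;      // -> "((float2)(1.0f, 2.0f))"
 *   c.is_assignable();    // -> false, a constant tile cannot appear on the left-hand side
 *   c.need_declaration(); // -> false
 */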
528
529enum class TensorComponentIndex : int32_t
530{
531 IndexMask = 0x0000000f,
532};
533
534enum class TensorComponentType : int32_t
535{
536 OffsetFirstElement = 0x00000100,
537 Stride = 0x00001000,
538 Dimension = 0x00010000,
539 FoldedDimension = 0x00100000,
540 Constant = 0x01000000
541};
542
543enum class TensorComponent : int32_t
544{
545 Unknown = 0x00000000,
546 OffsetFirstElement = 0x00000100,
547 Stride1 = 0x00001001,
548 Stride2 = 0x00001002,
549 Stride3 = 0x00001003,
550 Stride4 = 0x00001004,
551 Dim0 = 0x00010000,
552 Dim1 = 0x00010001,
553 Dim2 = 0x00010002,
554 Dim3 = 0x00010003,
555 Dim4 = 0x00010004,
556 C = 0x00010000, // Dim0
557 W = 0x00010001, // Dim1
558 H = 0x00010002, // Dim2
559 D = 0x00010003,
560 N = 0x00010004,
561 Dim1xDim2 = 0x00100021,
562 Dim1xDim2xDim3 = 0x00100321,
563 WxH = 0x00100021,
564 WxHxD = 0x00100321
565};
566
567inline std::string to_string(TensorComponent x)
568{
569 switch(x)
570 {
571 case TensorComponent::Unknown:
572 return "Unknown";
573 case TensorComponent::OffsetFirstElement:
574 return "OffsetFirstElement";
575 case TensorComponent::Stride1:
576 return "Stride1";
577 case TensorComponent::Stride2:
578 return "Stride2";
579 case TensorComponent::Stride3:
580 return "Stride3";
581 case TensorComponent::Stride4:
582 return "Stride4";
583 case TensorComponent::Dim0:
584 return "Dim0";
585 case TensorComponent::Dim1:
586 return "Dim1";
587 case TensorComponent::Dim2:
588 return "Dim2";
589 case TensorComponent::Dim3:
590 return "Dim3";
591 case TensorComponent::Dim4:
592 return "Dim4";
593 case TensorComponent::Dim1xDim2:
594 return "Dim1xDim2";
595 case TensorComponent::Dim1xDim2xDim3:
596 return "Dim1xDim2xDim3";
597 default:
598 assert(false);
 return "";
599 }
600}
601
602class ITensorArgument
603{
604public:
605 virtual ~ITensorArgument() = default;
606 /** Method to get the tensor component as a string
607 *
608 * @param[in] x tensor component to query
609 *
610 * @return the tensor component as a string
611 */
612 virtual std::string component(TensorComponent x) = 0;
613 /** Method to get the tensor component type declaration as a string
614 *
615 * @return the tensor component type declaration as a string
616 */
617 virtual std::string component_type_declaration() const = 0;
618 /** Method to get the tensor component data type
619 *
620 * @return the tensor component data type
621 */
622 virtual DataType component_data_type() const = 0;
623 /** Method to get the tensor component declarations
624 *
625 * @return a vector containing the tensor component declarations
626 */
627 virtual std::vector<TensorComponent> component_declarations() const = 0;
628 /** Method to get the name of the tensor argument.
629 *
630 * @return the name of the tensor argument
631 */
632 std::string name() const
633 {
634 return _basename;
635 }
636 /** Method to get the tensor format
637 *
638 * @return the format
639 */
640 TensorInfo format() const
641 {
642 return _format;
643 }
644
645protected:
646 TensorInfo _format { };
647 std::string _basename {};
648};
649
650enum class GpuTensorStorage : int32_t
651{
652 Unknown = 0x0000,
653 BufferUint8Ptr = 0x0012,
654 Image2dReadOnly = 0x0020,
655 Image2dWriteOnly = 0x0021,
656 Image3dReadOnly = 0x0030,
657 Image3dWriteOnly = 0x0031
658};
659
660class IGpuTensorArgument : public ITensorArgument
661{
662public:
663 virtual ~IGpuTensorArgument() = default;
664 /** Method to get the tensor storage, which is the underlying storage used to keep the data memory
665 *
666 * @param[in] x tensor storage to query
667 *
668 * @return the tensor storage as a string
669 */
670 virtual std::string storage(GpuTensorStorage x) = 0;
671 /** Method to get the tensor storage type declaration as a string
672 *
673 * @param[in] x tensor component to query
674 *
675 * @return the tensor storage type declaration as a string
676 */
677 virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;
678 /** Method to get the tensor storage declarations
679 *
680 * @return a vector containing the tensor storage declarations
681 */
682 virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
683};
684
685class ClTensorArgument : public IGpuTensorArgument
686{
687public:
688 ClTensorArgument(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
689 {
690 _basename = name;
691 _format = x;
692 _return_by_value_when_possible = return_by_value_when_possible;
693 }
694
695 // Methods to override
696 std::string component(TensorComponent x) override
697 {
698 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Constant)))
699 {
700 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
701 return std::to_string(idx - 1);
702 }
703
704 if(_return_by_value_when_possible)
705 {
706 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Dimension)))
707 {
708 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
709 return std::to_string(_format.shape[idx]);
710 }
711
712 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::FoldedDimension)))
713 {
714 switch(x)
715 {
716 case TensorComponent::Dim1xDim2:
717 return std::to_string(_format.shape[1] * _format.shape[2]);
718 case TensorComponent::Dim1xDim2xDim3:
719 return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[3]);
720 default:
721 std::cout << "Unsupported folded dimension" << std::endl;
722 assert(false);
723 }
724 }
725 }
726
727 if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
728 {
729 _components_required.push_back(x);
730 }
731
732 return build_component_name(x);
733 }
734
735 std::string component_type_declaration() const override
736 {
737 return "int";
738 };
739
740 DataType component_data_type() const override
741 {
742 return DataType::Int32;
743 }
744
745 std::string storage(GpuTensorStorage x) override
746 {
747 if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
748 {
749 _storage_required.push_back(x);
750 }
751
752 return build_storage_name(x);
753 }
754
755 std::string storage_type_declaration(GpuTensorStorage x) const override
756 {
757 switch(x)
758 {
759 case GpuTensorStorage::BufferUint8Ptr:
760 return "__global uchar*";
761 case GpuTensorStorage::Image2dReadOnly:
762 return "__read_only image2d_t";
763 case GpuTensorStorage::Image2dWriteOnly:
764 return "__write_only image2d_t";
765 case GpuTensorStorage::Image3dReadOnly:
766 return "__read_only image3d_t ";
767 case GpuTensorStorage::Image3dWriteOnly:
768 return "__write_only image3d_t ";
769 default:
770 std::cout << "Unsupported storage" << std::endl;
771 assert(false);
772 return "";
773 }
774 };
775
776 std::vector<GpuTensorStorage> storage_declarations() const override
777 {
778 return _storage_required;
779 }
780
781 std::vector<TensorComponent> component_declarations() const override
782 {
783 return _components_required;
784 }
785
786private:
787 std::string build_storage_name(GpuTensorStorage x) const
788 {
789 std::string var_name = _basename;
790
791 switch(x)
792 {
793 case GpuTensorStorage::BufferUint8Ptr:
794 return var_name + "_ptr";
795 case GpuTensorStorage::Image2dReadOnly:
796 case GpuTensorStorage::Image2dWriteOnly:
797 return var_name + "_img2d";
798 case GpuTensorStorage::Image3dReadOnly:
799 case GpuTensorStorage::Image3dWriteOnly:
800 return var_name + "_img3d";
801 default:
802 std::cout << "Unsupported storage" << std::endl;
803 assert(false);
804 }
805
806 return var_name;
807 }
808
809 std::string build_component_name(TensorComponent x) const
810 {
811 std::string var_name = _basename;
812
813 switch(x)
814 {
815 case TensorComponent::OffsetFirstElement:
816 return var_name + "_offset_first_element";
817 case TensorComponent::Stride1:
818 return var_name + "_stride1";
819 case TensorComponent::Stride2:
820 return var_name + "_stride2";
821 case TensorComponent::Stride3:
822 return var_name + "_stride3";
823 case TensorComponent::Dim0:
824 return var_name + "_dim0";
825 case TensorComponent::Dim1:
826 return var_name + "_dim1";
827 case TensorComponent::Dim2:
828 return var_name + "_dim2";
829 case TensorComponent::Dim3:
830 return var_name + "_dim3";
831 case TensorComponent::Dim1xDim2:
832 return var_name + "_dim1xdim2";
833 case TensorComponent::Dim1xDim2xDim3:
834 return var_name + "_dim1xdim2xdim3";
835 default:
836 std::cout << "Unsupported component" << std::endl;
837 assert(false);
838 }
839
840 return var_name;
841 }
842
843 bool _return_by_value_when_possible { false };
844 std::vector<GpuTensorStorage> _storage_required {};
845 std::vector<TensorComponent> _components_required {};
846};
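
/* Example (editor's illustrative sketch, not part of the original prototype): how a ClTensorArgument builds
 * the kernel argument names. The tensor name "src" and the id are arbitrary.
 *
 *   TensorInfo info;
 *   info.id = 3;
 *   ClTensorArgument src("src", info, false);
 *   src.component(TensorComponent::Stride1);                        // -> "src_stride1" (also recorded as required)
 *   src.storage(GpuTensorStorage::BufferUint8Ptr);                  // -> "src_ptr"
 *   src.storage_type_declaration(GpuTensorStorage::BufferUint8Ptr); // -> "__global uchar*"
 *   src.component_declarations();                                   // -> { TensorComponent::Stride1 }
 */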
847
848/**
849 * @brief Data structure that contains the tiles declared by the components.
850 * The registry is a linear data structure that follows a principle similar to a stack. The user can call the @p increment_registry_level() method to
851 * increase the level of the stack (0 when it starts). When the user calls the @p decrement_registry_level() method, the registry decreases the level of the stack
852 * and removes (pops) all the tiles declared at the level that is being left.
853 * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code.
854 * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions
855 * when declaring tiles among different components.
856 *
857 */
858class GpuTileRegistry
859{
860public:
861enum class RegistryTileType
862{
863 Tile,
864 Link
865};
866
867using RegistryIdSpace = int32_t;
868using RegistryLevel = int32_t;
869using RegistryTileName = std::string;
870
871struct RegistryTileTableEntry
872{
873 RegistryLevel registry_level { 0 };
874 std::unique_ptr<IVectorTile> tile_object { nullptr };
875};
876
877struct RegistryTileTypeTableEntry
878{
879 RegistryTileType tile_type { RegistryTileType::Tile };
880 RegistryTileName tile_name {};
881 RegistryIdSpace registry_idspace { 0 };
882 RegistryLevel registry_level { 0 };
883};
884
885using RegistryTileTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
886using RegistryTileTypeTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>;
887 /**
888 * @brief Construct a new Gpu Tile Registry object
889 *
890 */
891 GpuTileRegistry()
892 {
893 _language = GpuTargetLanguage::Unknown;
894 }
895 /**
896 * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language
897 *
898 * @param[in] language Gpu programming language to use
899 */
900 GpuTileRegistry(GpuTargetLanguage language)
901 {
902 _language = language;
903 }
904 /**
905 * @brief Default destructor. Destroy the Gpu Tile Registry object
906 *
907 */
908 ~GpuTileRegistry() = default;
909 /**
910 * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles.
911 * Therefore, the IdSpace should be set before declaring any tiles.
912 *
913 * @param[in] id The IdSpace id
914 */
915 void set_IdSpace(int32_t id)
916 {
917 _IdSpace = id;
918 }
919 /**
920 * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles
921 *
922 * @return The IdSpace id
923 */
924 int32_t IdSpace() const
925 {
926 return _IdSpace;
927 }
928 /**
929 * @brief Gets all the IdSpace declarations defined in the tile registry.
930 *
931 * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
932 */
933 std::vector<int32_t> IdSpace_declarations() const
934 {
935 std::vector<int32_t> x;
936
937 auto it = _frags.begin();
938
939 while (it != _frags.end())
940 {
941 x.push_back(it->first);
942
943 it++;
944 }
945
946 return x;
947 }
948 /**
949 * @brief Declare a tile from a previously created tile
950 */
951 void insert(const std::string& name, const IVectorTile *frag)
952 {
953 assert(_language == GpuTargetLanguage::OpenCL);
954 const int32_t key_IdSpace = _IdSpace;
955 const std::string key_var_name = name;
956 const std::string var_name = frag->name();
957 TileInfo format = frag->format();
958
959 // First check whether a tile with the same name exists
960 IVectorTile *result = (*this)[key_var_name];
961 assert(result == nullptr);
962 if(result == nullptr)
963 {
964 std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
965
966 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
967 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
968
969 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link;
970 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
971 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
972 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
973 }
974 }
975 /**
976 * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace()
977 *
978 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
979 *
980 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
981 * @param[in] format Tile format to use
982 */
983 void insert(const std::string& name, const TileInfo& format)
984 {
985 assert(_language == GpuTargetLanguage::OpenCL);
986 const int32_t key_IdSpace = _IdSpace;
987 const std::string key_var_name = name;
988 const std::string var_name = generate_tile_name(name);
989
990 // First check whether a tile with the same name exists
991 IVectorTile *result = (*this)[key_var_name];
992 assert(result == nullptr);
993 if(result == nullptr)
994 {
995 std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
996 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
997 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
998
999 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1000 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1001 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1002 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1003 }
1004 }
1005 /**
1006 * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
1007 *
1008 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
1009 *
1010 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
1011 * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are deduced from the 2D std::vector
1012 * @param[in] dt The data type for the elements stored in the 2D std::vector as std::string. It is the user's responsibility to ensure
1013 * that the data type is aligned with the content of the std::string.
1014 */
1015 void insert(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt)
1016 {
1017 assert(_language == GpuTargetLanguage::OpenCL);
1018 const int32_t key_IdSpace = _IdSpace;
1019 const std::string key_var_name = name;
1020
1021 // First check whether a tile with the same name exists
1022 IVectorTile *result = (*this)[key_var_name];
1023 assert(result == nullptr);
1024 if(result == nullptr)
1025 {
1026 std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
1027 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
1028 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
1029
1030 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1031 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1032 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1033 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1034 }
1035 }
1036 /**
1037 * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
1038 *
1039 * @note This method can be used to declare temporary tiles that need to be accessed only once.
1040 *
1041 * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are deduced from the 2D std::vector
1042 * @param[in] dt The data type for the elements stored in the 2D std::vector as std::string. It is the user's responsibility to ensure
1043 * that the data type is aligned with the content of the std::string.
1044 *
1045 * @return IVectorTile* the anonymous constant tile
1046 */
1047 IVectorTile* insert(const std::vector<std::vector<std::string>>& in, DataType dt)
1048 {
1049 assert(_language == GpuTargetLanguage::OpenCL);
1050 const int32_t key_IdSpace = _IdSpace;
1051 const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++);
1052
1053 // First check whether a tile with the same name exists
1054 IVectorTile *result = (*this)[key_var_name];
1055 assert(result == nullptr);
1056 if(result == nullptr)
1057 {
1058 std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
1059 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
1060 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
1061
1062 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1063 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1064 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1065 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1066 }
1067
1068 return (*this)[key_var_name];
1069 }
1070 /**
1071 * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user
1072 *
1073 * @param[in] name The name of the tile to retrieve
1074 * @param[in] IdSpace The IdSpace id where to search the tile
1075 *
1076 * @return IVectorTile* The tile
1077 */
1078 IVectorTile* get(const std::string& name, int32_t IdSpace)
1079 {
1080 const int32_t key_IdSpace = IdSpace;
1081 const std::string key_var_name = name;
1082
1083 IVectorTile* result = nullptr;
1084 auto search_IdSpace = _frags.find(key_IdSpace);
1085 if(search_IdSpace != _frags.end())
1086 {
1087 auto search_tile = _frags[key_IdSpace].find(key_var_name);
1088 if(search_tile != _frags[key_IdSpace].end())
1089 {
1090 result = search_tile->second.tile_object.get();
1091 assert(result != nullptr);
1092 }
1093 }
1094
1095 return result;
1096 }
1097 /**
1098 * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace()
1099 *
1100 * @param[in] name The name of the tile to retrieve
1101 *
1102 * @return IVectorTile* The tile
1103 */
1104 IVectorTile* operator[](const std::string& name)
1105 {
1106 return get(name, _IdSpace);
1107 }
1108 /**
1109 * @brief Check whether a tile with the given name exists in the IdSpace provided by the user
1110 *
1111 * @param[in] name Name of the tile to search for
1112 * @param[in] IdSpace The IdSpace id where to search the tile
1113 *
1114 * @return true if the tile exists
1115 * @return false if the tile does not exist
1116 */
1117 bool has_tile(const std::string& name, int32_t IdSpace) const
1118 {
1119 const int32_t key_IdSpace = IdSpace;
1120 const std::string key_var_name = name;
1121
1122 auto search_IdSpace = _frags.find(key_IdSpace);
1123 const bool idspace_exists = search_IdSpace != _frags.end();
1124
1125 return idspace_exists && search_IdSpace->second.find(key_var_name) != search_IdSpace->second.end();
1126 }
1127 /**
1128 * @brief Check whether the tile within the current IdSpace exists
1129 *
1130 * @param[in] name Name of the tile to search for
1131 *
1132 * @return true if the tile exists
1133 * @return false if the tile does not exist
1134 */
1135 bool has_tile(const std::string& name) const
1136 {
1137 return has_tile(name, _IdSpace);
1138 }
1139 /**
1140 * @brief Get all the tiles declared within the IdSpace provided by the user
1141 *
1142 * @param[in] IdSpace IdSpace where to retrieve all the declared tiles
1143 *
1144 * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user
1145 */
1146 std::vector<IVectorTile*> tile_declarations(int32_t IdSpace)
1147 {
1148 std::vector<IVectorTile*> tiles;
1149
1150 std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();
1151
1152 while (it != _frag_types[IdSpace].end())
1153 {
1154 // The following check should be enabled. However, it cannot be at this stage
1155 // because this method is currently also used to retrieve the output tile produced by each component,
1156 // even though it should NOT be used for that purpose.
1157 //if(it->second.tile_type == RegistryTileType::Tile)
1158 {
1159 tiles.push_back(get(it->second.tile_name, it->second.registry_idspace));
1160 }
1161 it++;
1162 }
1163
1164 return tiles;
1165 }
1166 /**
1167 * @brief Increase the level of stack.
1168 *
1169 */
1170 void increment_registry_level()
1171 {
1172 _registry_level++;
1173 }
1174 /**
1175 * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack.
1176 *
1177 */
1178 void decrement_registry_level()
1179 {
1180 assert(_registry_level >= 0);
1181
1182 // Remove all variables in the local scope
1183 std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();
1184
1185 while (it != _frags[_IdSpace].end())
1186 {
1187 if (it->second.registry_level == _registry_level)
1188 {
1189 it = _frags[_IdSpace].erase(it);
1190 }
1191 else
1192 {
1193 it++;
1194 }
1195 }
1196
1197 std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();
1198
1199 while (it_type != _frag_types[_IdSpace].end())
1200 {
1201 if (it_type->second.registry_level == _registry_level)
1202 {
1203 it_type = _frag_types[_IdSpace].erase(it_type);
1204 }
1205 else
1206 {
1207 it_type++;
1208 }
1209 }
1210
1211 _registry_level--;
1212 }
1213 /**
1214 * @brief Get the level of the stack
1215 *
1216 */
1217 int32_t level() const
1218 {
1219 return _registry_level;
1220 }
1221
1222private:
1223 // This method ensures that the key is unique among different components
1224 std::string generate_tile_name(const std::string& name)
1225 {
1226 assert(_IdSpace >= 0 );
1227 if(_registry_level == 0)
1228 {
1229 return "_G" + std::to_string(_IdSpace) + "_" + name;
1230 }
1231 else
1232 {
1233 return name;
1234 }
1235 }
1236 RegistryTileTable _frags {};
1237 RegistryTileTypeTable _frag_types {};
1238 RegistryLevel _registry_level { 0 };
1239 RegistryIdSpace _IdSpace { -1 };
1240 int32_t _anonymous_frag_count { 0 }; // Counter used to create the anonymous tiles
1241 GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language
1242};
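
/* Example (editor's illustrative sketch, not part of the original prototype): declaring and retrieving tiles.
 * Tiles declared at registry level 0 get a "_G<IdSpace>_" prefix and are visible everywhere; tiles declared
 * after increment_registry_level() are popped again by decrement_registry_level().
 *
 *   GpuTileRegistry tiles(GpuTargetLanguage::OpenCL);
 *   tiles.set_IdSpace(0);
 *   tiles.insert("acc", TileInfo(DataType::Fp32, 4, 8)); // declared internally as "_G0_acc"
 *   IVectorTile *acc = tiles["acc"];                      // retrieved through the reference name
 *   tiles.increment_registry_level();
 *   tiles.insert("tmp", TileInfo(DataType::Fp32, 4, 1));  // local tile
 *   tiles.decrement_registry_level();                      // "tmp" is removed, "acc" survives
 */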
1243
1244using TensorEntry = std::unique_ptr<IGpuTensorArgument>;
1245
1246/**
1247 * @brief Data structure that contains the tensors consumed by the components.
1248 * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions
1249 * when declaring tensors among different components.
1250 *
1251 */
1252class GpuTensorArgumentRegistry
1253{
1254public:
1255 /**
1256 * @brief Construct a new Gpu Tensor Registry object
1257 *
1258 */
1259 GpuTensorArgumentRegistry()
1260 {
1261 _language = GpuTargetLanguage::Unknown;
1262 }
1263 /**
1264 * @brief Construct a new Gpu Tensor Registry object
1265 *
1266 * @param[in] language Gpu programming language to use
1267 */
1268 GpuTensorArgumentRegistry(GpuTargetLanguage language)
1269 {
1270 _language = language;
1271 }
1272 /**
1273 * @brief Default destructor. Destroy the Gpu Tensor Registry object
1274 *
1275 */
1276 ~GpuTensorArgumentRegistry() = default;
1277 /**
1278 * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors.
1279 * Therefore, the IdSpace should be set before declaring any tensors.
1280 *
1281 * @param[in] id The IdSpace id
1282 */
1283 void set_IdSpace(int32_t id)
1284 {
1285 _IdSpace = id;
1286 }
1287 /**
1288 * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors
1289 *
1290 * @return The IdSpace id
1291 */
1292 int32_t IdSpace() const
1293 {
1294 return _IdSpace;
1295 }
1296 /**
1297 * @brief Gets all the IdSpace declarations defined in the tensor registry.
1298 *
1299 * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
1300 */
1301 std::vector<int32_t> IdSpace_declarations() const
1302 {
1303 std::vector<int32_t> x;
1304
1305 auto it = _refs.begin();
1306
1307 while (it != _refs.end())
1308 {
1309 x.push_back(it->first);
1310
1311 it++;
1312 }
1313
1314 return x;
1315 }
1316 /**
1317 * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace()
1318 *
1319 * @note The reference name used for declaring the tensor should not be previously used in the IdSpace
1320 *
1321 * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry.
1322 * @param[in] x Tensor info, which also carries the tensor id
1323 * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components
1324 */
1325 void insert(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
1326 {
1327 assert(_language == GpuTargetLanguage::OpenCL);
1328 const int32_t key_IdSpace = _IdSpace;
1329 const int32_t tensor_id = x.id;
1330 const std::string key_var_name = name;
1331 const std::string var_name = generate_tensor_name(name, tensor_id);
1332
1333 // First, check whether the tensor has already a reference. If so, trigger an assert
1334 assert(!has_tensor_argument(name));
1335
1336 // Check whether a tensor with that tensorID exists
1337 auto result = _tensor_arguments.find(tensor_id);
1338 if(result == _tensor_arguments.end())
1339 {
1340 // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference
1341 std::unique_ptr<ClTensorArgument> arg = std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
1342 _tensor_arguments[tensor_id] = std::move(arg);
1343 }
1344
1345 _refs[key_IdSpace][key_var_name] = tensor_id;
1346 }
1347 /**
1348 * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace()
1349 *
1350 * @param[in] name The name of the tensor to retrieve
1351 *
1352 * @return IGpuTensor* The tensor
1353 */
1354 IGpuTensorArgument* operator[](const std::string& name)
1355 {
1356 const int32_t key_IdSpace = _IdSpace;
1357 const std::string key_var_name = name;
1358
1359 IGpuTensorArgument* result = nullptr;
1360 auto search_IdSpace = _refs.find(key_IdSpace);
1361 if(search_IdSpace != _refs.end())
1362 {
1363 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1364
1365 if(search_tensor_id != _refs[key_IdSpace].end())
1366 {
1367 const int32_t tensor_id = search_tensor_id->second;
1368 auto search_tensor_argument = _tensor_arguments.find(tensor_id);
1369 if(search_tensor_argument != _tensor_arguments.end())
1370 {
1371 result = search_tensor_argument->second.get();
1372 }
1373 assert(result != nullptr);
1374 }
1375 }
1376
1377 return result;
1378 }
1379 /**
1380 * @brief Get all the tensors declared in the IdSpace provided by the user
1381 *
1382 * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors
1383 */
1384 std::vector<IGpuTensorArgument*> tensor_argument_declarations()
1385 {
1386 std::vector<IGpuTensorArgument*> args;
1387
1388 auto it = _tensor_arguments.begin();
1389
1390 while (it != _tensor_arguments.end())
1391 {
1392 args.push_back(it->second.get());
1393 it++;
1394 }
1395
1396 return args;
1397 }
1398 /**
1399 * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists
1400 *
1401 * @param[in] name Name of the tensor argument to search for
1402 *
1403 * @return true if the tensor argument exists
1404 * @return false if the tensor argument does not exist
1405 */
1406 bool has_tensor_argument(const std::string& name)
1407 {
1408 const int32_t key_IdSpace = _IdSpace;
1409 const std::string key_var_name = name;
1410
1411 auto search_IdSpace = _refs.find(key_IdSpace);
1412
1413 if(search_IdSpace != _refs.end())
1414 {
1415 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1416
1417 return search_tensor_id != _refs[key_IdSpace].end();
1418 }
1419 else
1420 {
1421 return false;
1422 }
1423 }
1424 /**
1425 * @brief Check whether the tensor argument exists in the IdSpace provided by the user
1426 *
1427 * @param[in] name Name of the tensor argument to search for
1428 * @param[in] IdSpace The IdSpace id where to search the tensor argument
1429 *
1430 * @return true if the tensor argument exists
1431 * @return false if the tensor argument does not exist
1432 */
1433 bool has_tensor_argument(const std::string& name, int32_t IdSpace)
1434 {
1435 const int32_t key_IdSpace = IdSpace;
1436 const std::string key_var_name = name;
1437
1438 auto search_IdSpace = _refs.find(key_IdSpace);
1439
1440 if(search_IdSpace != _refs.end())
1441 {
1442 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1443
1444 return search_tensor_id != _refs[key_IdSpace].end();
1445 }
1446 else
1447 {
1448 return false;
1449 }
1450 }
1451private:
1452 // This method ensures that the key is unique among different components
1453 std::string generate_tensor_name(const std::string& name, int32_t tensor_id)
1454 {
1455 assert(tensor_id >= 0 );
1456
1457 return name + std::to_string(tensor_id);
1458 }
1459
1460 std::map<int32_t, TensorEntry> _tensor_arguments {};
1461 std::map<int32_t, std::map<std::string, int32_t>> _refs {};
1462 int32_t _IdSpace { -1 };
1463 GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language
1464};
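
/* Example (editor's illustrative sketch, not part of the original prototype): registering a tensor argument
 * and retrieving it through its reference name. The unique variable name is derived from the tensor id.
 *
 *   GpuTensorArgumentRegistry args(GpuTargetLanguage::OpenCL);
 *   args.set_IdSpace(0);
 *   TensorInfo src_info;
 *   src_info.id = 2;
 *   args.insert("src", src_info, true); // internally named "src2"
 *   IGpuTensorArgument *src = args["src"];
 *   args.has_tensor_argument("src");    // -> true
 */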
1465
1466enum class OpType : int32_t
1467{
1468 Elementwise = 0x0000,
1469 Relational = 0x1000,
1470 Algebra = 0x2000
1471};
1472
1473inline std::string to_string(AssignmentOp op)
1474{
1475 switch(op)
1476 {
1477 case AssignmentOp::Decrement:
1478 return "-=";
1479 case AssignmentOp::Increment:
1480 return "+=";
1481 default:
1482 assert(false);
1483 return "";
1484 }
1485}
1486
1487inline std::string to_string(BinaryOp op)
1488{
1489 switch(op)
1490 {
1491 case BinaryOp::Add:
1492 return "+";
1493 case BinaryOp::Sub:
1494 return "-";
1495 case BinaryOp::Mul:
1496 return "*";
1497 case BinaryOp::Div:
1498 return "/";
1499 case BinaryOp::Mod:
1500 return "%";
1501 case BinaryOp::Equal:
1502 return "==";
1503 case BinaryOp::Less:
1504 return "<";
1505 case BinaryOp::LessEqual:
1506 return "<=";
1507 case BinaryOp::Greater:
1508 return ">";
1509 case BinaryOp::GreaterEqual:
1510 return ">=";
1511 case BinaryOp::LogicalAnd:
1512 return "&&";
1513 case BinaryOp::LogicalOr:
1514 return "||";
1515 case BinaryOp::LogicalNot:
1516 return "!";
1517 default:
1518 assert(false);
1519 return "";
1520 }
1521}
1522
1523inline std::string binary_op_string(BinaryOp op)
1524{
1525 switch(op)
1526 {
1527 case BinaryOp::Add:
1528 return "add";
1529 case BinaryOp::Sub:
1530 return "sub";
1531 case BinaryOp::Mul:
1532 return "mul";
1533 case BinaryOp::Div:
1534 return "div";
1535 case BinaryOp::Mod:
1536 return "mod";
1537 case BinaryOp::Equal:
1538 return "eq";
1539 case BinaryOp::Less:
1540 return "gt";
1541 case BinaryOp::LessEqual:
1542 return "gteq";
1543 case BinaryOp::Greater:
1544 return "lt";
1545 case BinaryOp::GreaterEqual:
1546 return "lte";
1547 default:
1548 assert(false);
1549 return "";
1550 }
1551}
1552
1553enum class OperandType : int32_t
1554{
1555 Unknown = 0x00000000,
1556 ScalarFp32 = 0x00001011, // Immediate scalar tile
1557 ScalarFp16 = 0x00001012, // Immediate scalar tile
1558 ScalarInt32 = 0x00001021, // Immediate scalar tile
1559 ScalarInt16 = 0x00001022, // Immediate scalar tile
1560 ScalarInt8 = 0x00001024, // Immediate scalar tile
1561 ScalarUInt32 = 0x00001031, // Immediate scalar tile
1562 ScalarUInt16 = 0x00001032, // Immediate scalar tile
1563 ScalarUInt8 = 0x00001034, // Immediate scalar tile
1564 ScalarBool = 0x00001041, // Immediate scalar tile
1565 ScalarTile = 0x00001050, // Scalar from a tile
1566 Tile = 0x00010000, // Tile
1567 TensorStride1 = 0x00100001, // Tensor component
1568 TensorStride2 = 0x00100002, // Tensor component
1569 TensorStride3 = 0x00100003, // Tensor component
1570 TensorStride4 = 0x00100004, // Tensor component
1571 TensorDim0 = 0x00100010, // Tensor component
1572 TensorDim1 = 0x00100020, // Tensor component
1573 TensorDim2 = 0x00100030, // Tensor component
1574 TensorDim3 = 0x00100040, // Tensor component
1575 TensorDim4 = 0x00100050, // Tensor component
1576 TensorC = 0x00100010, // Tensor component
1577 TensorW = 0x00100020, // Tensor component
1578 TensorH = 0x00100030, // Tensor component
1579 TensorD = 0x00100040, // Tensor component
1580 TensorN = 0x00100050, // Tensor component
1581 TensorDim1xDim2 = 0x00100100, // Tensor component
1582 TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
1583 TensorWxH = 0x00100300, // Tensor component
1584 TensorWxHxD = 0x00100400, // Tensor component
1585 TensorDataOffset = 0x00100500, // Tensor component
1586};
1587
1588struct ScalarTileCoord
1589{
1590 ScalarTileCoord() {}
1591 ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) {}
1592 int32_t x { -1 };
1593 int32_t y { -1 };
1594};
1595/**
1596 * @brief Operand class. This object is used to pass the operands to the operations performed by the writer.
1597 * Operand can be of three types:
1598 * -# Scalar immediate: constant expression
1599 * -# Tile: A tile
1600 * -# Tensor component: A component (scalar) of a tensor
1601 *
1602 */
1603class Operand
1604{
1605public:
1606 Operand(const std::string &val)
1607 {
1608 _str = val;
1609 _type = OperandType::Tile;
1610 }
1611
1612 Operand(const std::string &val, const ScalarTileCoord& coord)
1613 {
1614 _str = val;
1615 _type = OperandType::ScalarTile;
1616 _coord = coord;
1617 }
1618
1619 Operand(const std::string &val, OperandType type)
1620 {
1621 _str = val;
1622 _type = type;
1623 }
1624
1625 Operand(const Operand& t)
1626 {
1627 _str = t.value();
1628 _type = t.type();
1629 }
1630
1631 Operand& operator=(const Operand& t)
1632 {
1633 _str = t.value();
1634 _type = t.type();
1635 _coord = t.scalar_tile_coordinate();
1636 return *this;
1637 }
1638
1639 std::string value() const
1640 {
1641 return _str;
1642 }
1643
1644 OperandType type() const
1645 {
1646 return _type;
1647 }
1648
1649 ScalarTileCoord scalar_tile_coordinate() const
1650 {
1651 return _coord;
1652 }
1653
1654private:
1655 std::string _str {};
1656 OperandType _type { OperandType::Unknown };
1657 ScalarTileCoord _coord {};
1658};
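
/* Example (editor's illustrative sketch, not part of the original prototype): the kinds of operand that can
 * be passed to the writer. The names "acc" and "src" are arbitrary tile/tensor reference names.
 *
 *   Operand tile_op("acc");                               // a tile
 *   Operand imm_op("4", OperandType::ScalarInt32);        // a scalar immediate
 *   Operand scalar_of_tile("acc", ScalarTileCoord(0, 0)); // a single scalar taken from a tile
 *   Operand tensor_dim("src", OperandType::TensorDim1);   // a tensor component (dimension 1 of "src")
 */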
1659
1660enum class GpuSamplerTensorStorage : int32_t
1661{
1662 Unknown = static_cast<int32_t>(GpuTensorStorage::Unknown),
1663 BufferUint8Ptr = static_cast<int32_t>(GpuTensorStorage::BufferUint8Ptr),
1664 Image2dReadOnly = static_cast<int32_t>(GpuTensorStorage::Image2dReadOnly),
1665 Image2dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image2dWriteOnly),
1666 Image3dReadOnly = static_cast<int32_t>(GpuTensorStorage::Image3dReadOnly),
1667 Image3dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image3dWriteOnly),
1668};
1669
1670struct GpuSampler
1671{
1672 GpuSampler() = default;
1673 TensorSamplerFormat format { TensorSamplerFormat::Unknown };
1674 GpuSamplerTensorStorage storage { GpuSamplerTensorStorage::Unknown };
1675 TensorSamplerAddressModeX address_mode_x { TensorSamplerAddressModeX::Unknown };
1676 TensorSamplerAddressModeY address_mode_y { TensorSamplerAddressModeY::Unknown };
1677 TensorSamplerAddressModeZ address_mode_z { TensorSamplerAddressModeZ::Unknown };
1678};
1679
1680inline GpuSampler create_simple_sampler(const TensorInfo* tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
1681{
1682 CKW_UNUSED(step_x, step_y, step_z);
1683
1684 auto tensor = tensor_info_id->shape;
1685
1686 GpuSampler dst_sampler;
1687 dst_sampler.format = sampler.format;
1688 dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr;
1689 dst_sampler.address_mode_x = sampler.address_mode_x;
1690 dst_sampler.address_mode_y = sampler.address_mode_y;
1691 dst_sampler.address_mode_z = sampler.address_mode_z;
1692
1693 int32_t dim_x = 0;
1694 int32_t dim_y = 0;
1695 int32_t dim_z = 0;
1696
1697 switch(sampler.format)
1698 {
1699 case TensorSamplerFormat::C_W_H:
1700 dim_x = tensor[0];
1701 dim_y = tensor[1];
1702 dim_z = tensor[2];
1703 break;
1704 case TensorSamplerFormat::C_WH_1:
1705 dim_x = tensor[0];
1706 dim_y = tensor[1] * tensor[2];
1707 dim_z = 1;
1708 break;
1709 default:
1710 std::cout << "Unsupported tensor format" << std::endl;
1711 assert(false);
1712 break;
1713 }
1714
1715 if(dim_x == 1)
1716 {
1717 assert(step_x == 1);
1718 dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
1719 }
1720
1721 if(dim_y == 1)
1722 {
1723 assert(step_y == 1);
1724 dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
1725 }
1726
1727 if(dim_z == 1)
1728 {
1729 assert(step_z == 1);
1730 dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
1731 }
1732
1733 return dst_sampler;
1734}
1735
1736class GpuOutputSampler
1737{
1738public:
1739 GpuOutputSampler() = default;
1740 /**
1741 * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once
1742 * by the root component. Once initialized, all simple components will need to use this sampler
1743 * or a broadcasted version of it
1744 *
1745 * @param[in] tensor_info_id Tensor info of the tensor to write
1746 * @param[in] tensor_storage Tensor storage to use
1747 * @param[in] tensor_format Tensor format to use
1748 * @param[in] step_x Increment step in the X direction. It is not necessarily the same as the n0 of the tile!
1749 * @param[in] step_y Increment step in the Y direction. It is not necessarily the same as the m0 of the tile!
1750 * @param[in] step_z Increment step in the Z direction. It is not necessarily the same as the d0 of the tile!
1749 */
1750 void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z)
1751 {
1752 assert(_is_initialized == false);
1753
1754 _step_x = step_x;
1755 _step_y = step_y;
1756 _step_z = step_z;
1757 _tensor_info_id = tensor_info_id;
1758 _sampler = create_sampler(tensor_storage, tensor_format);
1759 _is_initialized = true;
1760 };
1761
1762 GpuSampler sampler() const
1763 {
1764 return _sampler;
1765 };
1766
1767 int32_t step_x() const
1768 {
1769 return _step_x;
1770 };
1771
1772 int32_t step_y() const
1773 {
1774 return _step_y;
1775 };
1776
1777 int32_t step_z() const
1778 {
1779 return _step_z;
1780 };
1781private:
1782 GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format)
1783 {
1784 // Output can only be in output mode
1785 assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly);
1786 assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly);
1787
1788 auto tensor = _tensor_info_id->shape;
1789
1790 GpuSampler sampler;
1791 sampler.format = tensor_format;
1792 sampler.storage = tensor_storage;
1793 sampler.address_mode_x = TensorSamplerAddressModeX::None;
1794 sampler.address_mode_y = TensorSamplerAddressModeY::None;
1795 sampler.address_mode_z = TensorSamplerAddressModeZ::None;
1796
1797 // In the case of texture, we do not need any special checks at the border
1798 if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
1799 {
1800 int32_t dim_x = 0;
1801 int32_t dim_y = 0;
1802 int32_t dim_z = 0;
1803
1804 switch(tensor_format)
1805 {
1806 case TensorSamplerFormat::C_W_H:
1807 dim_x = tensor[0];
1808 dim_y = tensor[1];
1809 dim_z = tensor[2];
1810 break;
1811 case TensorSamplerFormat::C_WH_1:
1812 dim_x = tensor[0];
1813 dim_y = tensor[1] * tensor[2];
1814 dim_z = 1;
1815 break;
1816 default:
1817 std::cout << "Unsupported tensor format" << std::endl;
1818 assert(false);
1819 break;
1820 }
1821
1822 if((dim_x % _step_x) != 0 && dim_x != 1)
1823 {
1824 sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
1825 }
1826
1827 if((dim_y % _step_y) != 0 && dim_y != 1)
1828 {
1829 sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
1830 }
1831
1832 if((dim_z % _step_z) != 0 && dim_z != 1)
1833 {
1834 sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
1835 }
1836 }
1837
1838 return sampler;
1839 }
1840 GpuSampler _sampler { }; // GpuSampler
1841 int32_t _step_x { 1 };
1842 int32_t _step_y { 1 };
1843 int32_t _step_z { 1 };
1844 const TensorInfo* _tensor_info_id { nullptr };
1845 bool _is_initialized { false };
1846};
1847
1848/**
1849 * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer.
1850 */
1851class TensorOperand
1852{
1853public:
1854 TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
1855 {
1856 }
1857
1858 TensorOperand& operator=(const TensorOperand& t)
1859 {
1860 _str = t.value();
1861 _sampler = t.sampler();
1862 return *this;
1863 }
1864
1865 std::string value() const
1866 {
1867 return _str;
1868 }
1869
1870 GpuSampler sampler() const
1871 {
1872 return _sampler;
1873 }
1874
1875private:
1876 std::string _str {};
1877 GpuSampler _sampler {};
1878};
1879
1880/**
1881 * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer
1882 * This data structure must be initialized before being passed to the Gpu Kernel Writer
1883 *
1884 */
1885class GpuKernelWriterDataHolder
1886{
1887public:
1888 /**
1889 * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
1890 * the GPU target and target specific capabilities (extensions). For now, we just initialize the
1891 * programming language
1892 *
1893 * @param[in] language Gpu programming language to use
1894 */
1895 GpuKernelWriterDataHolder(GpuTargetLanguage language) : tiles(language), arguments(language), code(""), _language(language)
1896 {
1897 }
1898 /**
1899 * @brief Get the Gpu programming language used
1900 *
1901 * @return GpuTargetLanguage the Gpu programming language
1902 */
1903 GpuTargetLanguage programming_language() const
1904 {
1905 return _language;
1906 }
1907 /**
1908 * @brief @ref GpuTileRegistry
1909 *
1910 */
1911 GpuTileRegistry tiles{};
1912 /**
1913 * @brief @ref GpuTensorArgumentRegistry
1914 *
1915 */
1916 GpuTensorArgumentRegistry arguments{};
1917 /**
1918 * @brief @ref GpuOutputSampler.
1919 *
1920 */
1921 GpuOutputSampler output_sampler{};
1922 /**
1923 * @brief Source code
1924 *
1925 */
1926 std::string code{};
1927
1928 // GpuExtensionRegistry extensions{};
1929private:
1930 GpuTargetLanguage _language;
1931};
1932
1933struct LWS
1934{
1935 int32_t x {1};
1936 int32_t y {1};
1937 int32_t z {1};
1938};
1939
1940/**
 * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker
 * declares an anonymous tile in the tile registry.
1943 */
1944class OperandUnpacker
1945{
1946public:
1947 OperandUnpacker(GpuTileRegistry& tiles, GpuTensorArgumentRegistry& arguments) : _tiles(tiles), _arguments(arguments)
1948 {
1949 // Increase the level of the stack to allocate possible temporary tiles
1950 _tiles.increment_registry_level();
1951 };
1952
1953 ~OperandUnpacker()
1954 {
1955 // Decrease the level of the stack to deallocate any temporary tiles
1956 _tiles.decrement_registry_level();
1957 }
1958
1959 IVectorTile* unpack(const Operand& src)
1960 {
1961 // Get the tile
1962 if(src.type() == OperandType::Tile)
1963 {
1964 assert(_tiles.has_tile(src.value()));
1965 return _tiles[src.value()];
1966 }
1967 // Create an anonymous tile with a constant
1968 else if(static_cast<int32_t>(src.type()) & 0x00001000)
1969 {
1970 if(src.type() == OperandType::ScalarTile)
1971 {
1972 ScalarTileCoord coord = src.scalar_tile_coordinate();
1973 assert(_tiles.has_tile(src.value()));
1974 assert(coord.x >= 0);
1975 assert(coord.y >= 0);
1976 auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
1977 return _tiles.insert({{{val.str}}}, val.type.dt);
1978 }
1979 else
1980 {
1981 return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
1982 }
1983 }
1984 // Create an anonymous tile with the tensor component
1985 else
1986 {
1987 assert(_arguments.has_tensor_argument(src.value()));
1988 auto x = _arguments[src.value()];
1989 const std::string val = x->component(to_tensor_component(src.type()));
1990 const DataType dt = x->component_data_type();
1991 return _tiles.insert({{{val}}}, dt);
1992 }
1993 }
1994
1995private:
1996 DataType to_tile_data_type(OperandType x)
1997 {
1998 return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
1999 }
2000
2001 TensorComponent to_tensor_component(OperandType x)
2002 {
2003 switch(x)
2004 {
2005 case OperandType::TensorDim0:
2006 return TensorComponent::Dim0;
2007 case OperandType::TensorDim1:
2008 return TensorComponent::Dim1;
2009 case OperandType::TensorDim2:
2010 return TensorComponent::Dim2;
2011 case OperandType::TensorDim3:
2012 return TensorComponent::Dim3;
2013 case OperandType::TensorDim4:
2014 return TensorComponent::Dim4;
2015 case OperandType::TensorStride1:
2016 return TensorComponent::Stride1;
2017 case OperandType::TensorStride2:
2018 return TensorComponent::Stride2;
2019 case OperandType::TensorStride3:
2020 return TensorComponent::Stride3;
2021 case OperandType::TensorStride4:
2022 return TensorComponent::Stride4;
2023 case OperandType::TensorDim1xDim2:
2024 return TensorComponent::Dim1xDim2;
2025 case OperandType::TensorDim1xDim2xDim3:
2026 return TensorComponent::Dim1xDim2xDim3;
2027 case OperandType::TensorDataOffset:
2028 return TensorComponent::OffsetFirstElement;
2029 default:
2030 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002031 return TensorComponent::Unknown;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002032 }
2033 }
2034
2035 GpuTileRegistry& _tiles;
2036 GpuTensorArgumentRegistry& _arguments;
2037};
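// Illustrative sketch (not part of this header) of how OperandUnpacker is typically used while writing an
// operation, where "data" is a GpuKernelWriterDataHolder. The tile name "dst" and the tensor argument name
// "src" are hypothetical and must already be registered in the respective registries:
//
//   OperandUnpacker operands(data.tiles, data.arguments);
//   IVectorTile *dst = operands.unpack(Operand("dst"));                          // existing tile
//   IVectorTile *imm = operands.unpack(Operand("4", OperandType::ScalarInt32));  // anonymous constant tile
//   IVectorTile *dim = operands.unpack(Operand("src", OperandType::TensorDim0)); // tensor component as a tile
//
// The anonymous tiles are allocated in a new registry level which is released when the unpacker goes out of scope.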
2038
2039/**
 * @brief Utility class used to get the tensor argument from a tensor operand. Unlike @ref OperandUnpacker,
 * this class does not create anonymous tiles: the tensor argument must already be registered in the
 * tensor argument registry.
2043 */
2044class TensorOperandUnpacker
2045{
2046public:
2047 TensorOperandUnpacker(GpuTensorArgumentRegistry& arguments) : _arguments(arguments)
2048 {
2049 };
2050
2051 IGpuTensorArgument* unpack(const TensorOperand& src)
2052 {
2053 assert(_arguments.has_tensor_argument(src.value()));
2054 return _arguments[src.value()];
2055 }
2056
2057private:
2058 GpuTensorArgumentRegistry& _arguments;
2059};
2060
2061/**
 * @brief The GpuKernel will be used in three stages:
 * -# Compilation stage
 * -# Tuning stage
 * -# Dispatch stage
2066 */
2067struct GpuKernel
2068{
2069 // Compilation stage
2070 std::string code {}; // Source code, required for the compilation stage
2071 std::vector<GpuExtensions> list_extensions{}; // Extensions, required for the compilation stage
2072 // Tuning stage
2073 std::string config_id {}; // Unique id, required for the tuning stage
2074 std::vector<LWS> list_lws{}; // LWS to test, required for the tuning stage
2075 // Dispatch stage
2076 GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
2077 std::vector<std::pair<int32_t, GpuTensorStorage>> list_tensor_storages; // List of tensor storages, required for the dispatch stage
2078 std::vector<std::pair<int32_t, TensorComponent>> list_tensor_components;// List of tensor components (width, stride,..), required for the dispatch stage)
2079};
2080
// Generates the full kernel source code (signature and body) for the given kernel name from the data holder
2082inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string& name)
2083{
2084 std::string code;
2085 code += "__kernel void ";
2086 code += name;
2087 code += "(\n";
2088
2089 auto IdSpaces = in.arguments.IdSpace_declarations();
2090
2091 std::vector<std::string> arg_str;
2092
2093 auto tensor_args = in.arguments.tensor_argument_declarations();
2094
2095 for(auto &i : tensor_args)
2096 {
2097 // For each tensor used, get the storage and tensor components
2098 auto storages = i->storage_declarations();
2099 auto components = i->component_declarations();
2100
2101 for(auto &y : storages)
2102 {
2103 std::string str;
2104 str += i->storage_type_declaration(y);
2105 str += " ";
2106 str += i->storage(y);
2107 arg_str.push_back(str);
2108 }
2109
2110 for(auto &y : components)
2111 {
2112 std::string str;
2113 str += i->component_type_declaration();
2114 str += " ";
2115 str += i->component(y);
2116 arg_str.push_back(str);
2117 }
2118 }
2119
2120 for(size_t i = 0; i < arg_str.size(); ++i)
2121 {
2122 code += arg_str[i];
2123 if(i + 1 < arg_str.size())
2124 {
2125 code += ",\n";
2126 }
2127 }
2128
2129 code += ")\n";
2130 code += "{\n";
2131 code += in.code;
2132 code += "}\n";
2133
2134 return code;
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002135}
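// For reference, generate_code() produces a kernel skeleton of the following shape. The argument list depends
// on which storages and components were marked as used; the names below are purely illustrative:
//
//   __kernel void my_kernel(
//   __global uchar *src_ptr,
//   int src_stride_y,
//   int src_dim1xdim2
//   )
//   {
//   <body accumulated in GpuKernelWriterDataHolder::code>
//   }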
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002136
2137/**
 * @brief This class is responsible for mapping an N-dimensional tensor to a 3d tensor. The mapper needs the GpuSampler
 * to know how to reduce the dimensionality of the tensor
2140 *
2141 */
2142class GpuTensor3dMapper
2143{
2144public:
2145 GpuTensor3dMapper(IGpuTensorArgument* tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor)
2146 {
2147 };
2148
2149 std::string tensor_component_x() const
2150 {
2151 const auto format = _sampler.format;
2152 switch(format)
2153 {
2154 case TensorSamplerFormat::C_WH_1:
2155 case TensorSamplerFormat::C_W_H:
2156 return _tensor->component(TensorComponent::C);
2157 default:
2158 std::cout << "Unsupported tensor format" << std::endl;
2159 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002160 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002161 }
2162 }
2163
2164 std::string tensor_component_y() const
2165 {
2166 const auto format = _sampler.format;
2167 switch(format)
2168 {
2169 case TensorSamplerFormat::C_WH_1:
2170 return _tensor->component(TensorComponent::WxH);
2171 case TensorSamplerFormat::C_W_H:
2172 return _tensor->component(TensorComponent::W);
2173 default:
2174 std::cout << "Unsupported tensor format" << std::endl;
2175 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002176 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002177 }
2178 }
2179
2180 std::string tensor_component_z() const
2181 {
2182 const auto format = _sampler.format;
2183 switch(format)
2184 {
2185 case TensorSamplerFormat::C_WH_1:
2186 return "1";
2187 case TensorSamplerFormat::C_W_H:
2188 return _tensor->component(TensorComponent::H);
2189 default:
2190 std::cout << "Unsupported tensor format" << std::endl;
2191 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002192 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002193 }
2194 }
2195
2196 std::string tensor_component_stride_y() const
2197 {
2198 const auto format = _sampler.format;
2199 switch(format)
2200 {
2201 case TensorSamplerFormat::C_WH_1:
2202 case TensorSamplerFormat::C_W_H:
2203 return _tensor->component(TensorComponent::Stride1);
2204 default:
2205 std::cout << "Unsupported tensor format" << std::endl;
2206 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002207 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002208 }
2209 }
2210
2211 std::string tensor_component_stride_z() const
2212 {
2213 const auto format = _sampler.format;
2214 switch(format)
2215 {
2216 case TensorSamplerFormat::C_WH_1:
2217 return "0";
2218 case TensorSamplerFormat::C_W_H:
2219 return _tensor->component(TensorComponent::Stride2);
2220 default:
2221 std::cout << "Unsupported tensor format" << std::endl;
2222 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002223 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002224 }
2225 }
2226
2227 std::string tensor_component_stride_batch() const
2228 {
2229 const auto format = _sampler.format;
2230 switch(format)
2231 {
2232 case TensorSamplerFormat::C_WH_1:
2233 case TensorSamplerFormat::C_W_H:
2234 return _tensor->component(TensorComponent::Stride3);
2235 default:
2236 std::cout << "Unsupported tensor format" << std::endl;
2237 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002238 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002239 }
2240 }
2241
2242 bool is_one_component_x() const
2243 {
2244 auto t = _tensor->format();
2245 const auto format = _sampler.format;
2246 switch(format)
2247 {
2248 case TensorSamplerFormat::C_WH_1:
2249 case TensorSamplerFormat::C_W_H:
2250 return t.shape[0] == 1;
2251 default:
2252 std::cout << "Unsupported tensor format" << std::endl;
2253 assert(false);
                return false;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002255 }
2256 }
2257
2258 bool is_one_component_y() const
2259 {
2260 auto t = _tensor->format();
2261 const auto format = _sampler.format;
2262 switch(format)
2263 {
2264 case TensorSamplerFormat::C_WH_1:
2265 return (t.shape[1] * t.shape[2]) == 1;
2266 case TensorSamplerFormat::C_W_H:
2267 return t.shape[1] == 1;
2268 default:
2269 std::cout << "Unsupported tensor format" << std::endl;
2270 assert(false);
                return false;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002272 }
2273 }
2274
2275 bool is_one_component_z() const
2276 {
2277 auto t = _tensor->format();
2278 const auto format = _sampler.format;
2279 switch(format)
2280 {
2281 case TensorSamplerFormat::C_WH_1:
2282 return true;
2283 case TensorSamplerFormat::C_W_H:
2284 return t.shape[2] == 1;
2285 default:
2286 std::cout << "Unsupported tensor format" << std::endl;
2287 assert(false);
                return false;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002289 }
2290 }
2291
2292 bool is_one_component_batch() const
2293 {
2294 auto t = _tensor->format();
2295 const auto format = _sampler.format;
2296 switch(format)
2297 {
2298 case TensorSamplerFormat::C_WH_1:
2299 case TensorSamplerFormat::C_W_H:
2300 return t.shape[3] == 1;
2301 default:
2302 std::cout << "Unsupported tensor format" << std::endl;
2303 assert(false);
                return false;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002305 }
2306 }
2307
2308 GpuSampler gpu_sampler() const
2309 {
2310 return _sampler;
2311 }
2312
2313 IGpuTensorArgument* tensor_argument() const
2314 {
2315 return _tensor;
2316 }
2317
2318private:
2319 GpuSampler _sampler;
2320 IGpuTensorArgument* _tensor;
2321};
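// Illustrative example of the mapping (hypothetical NHWC shape {C = 32, W = 16, H = 8, N = 1}):
//   - TensorSamplerFormat::C_W_H  maps (x, y, z) onto (C, W, H): x spans 32, y spans 16 and z spans 8 elements.
//   - TensorSamplerFormat::C_WH_1 collapses W and H: x spans 32, y spans 16 * 8 = 128 elements and z is always 1
//     (tensor_component_z() returns "1" and tensor_component_stride_z() returns "0").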
2322
2323struct GpuKernelWriterAttribute
2324{
2325 bool return_tensor_component_by_value { false };
2326};
2327
2328enum class ConvertPolicy
2329{
2330 Wrap, /**< Wrap around */
2331 Saturate /**< Saturate */
2332};
2333
2334enum class RoundingMode
2335{
2336 None,
2337 Rte,
2338 Rtz,
2339 Rtp,
2340 Rtn
2341};
2342
2343// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html
2344class IGpuKernelWriter
2345{
2346public:
2347 virtual ~IGpuKernelWriter() = default;
2348 virtual void set_IdSpace(int32_t id) = 0;
2349 virtual void import_tile(const std::string& dst, const IVectorTile *src) = 0;
2350 virtual void declare_argument(const std::string& name, const TensorInfo& tensor) = 0;
2351 virtual void declare_tile(const std::string& name, const TileInfo& info) = 0;
2352 virtual void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) = 0;
2353 virtual void write_text(const std::string& x) = 0;
2354 virtual void compound_statement_begin() = 0;
2355 virtual void compound_statement_end() = 0;
2356
2357 // Operations
2358 virtual void op_get_global_id(const Operand& dst_var, int32_t dim) = 0;
2359 virtual void op_get_global_coord(const Operand& dst, const Operand& step, const TensorOperand& tensor, int32_t dim) = 0;
2360 virtual void op_get_global_batch(const Operand& dst, const TensorOperand& tensor) = 0;
2361 virtual void op_get_global_size(const Operand& dst_var, int32_t dim) = 0;
2362 virtual void op_binary_expression(const Operand& dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
2363 virtual void op_assign(const Operand& dst_name, const Operand& src_name) = 0;
2364 virtual void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) = 0;
2365 virtual void op_if(const Operand& lhs, BinaryOp op, const Operand& rhs) = 0;
2366 virtual void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value, AssignmentOp update_op, const Operand& update_value) = 0;
2367 virtual void op_load_indirect(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y_indirect, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
2368 virtual void op_load_immediate(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32), const Operand& dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
2369 virtual void op_store_immediate(const TensorOperand& tensor, const Operand& src, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
2370 virtual void op_cast_expression(const Operand& dst, const Operand &src, ConvertPolicy policy) = 0;
2371 virtual void op_return() = 0;
2372 // virtual void op_else() = 0;
2373 // virtual void op_elseif() = 0;
2374 // Utils
    // Computes a tile of indirect offsets (x + y * width) into a tensor, writing -1 for out-of-bounds coordinates
2376 virtual void util_get_indirect_buffer(const Operand& dst, const TensorOperand& tensor, const Operand& x, const Operand& y, const Operand& x_off, const Operand& y_off) = 0;
2377};
2378
2379enum class GpuLoadStoreType
2380{
2381 Load = 1,
2382 Store = 2
2383};
2384
2385class IGpuLoadStoreHelperWriter
2386{
2387public:
2388 IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type) : _writer(x), _mapper(mapper), _type(type) {}
2389 IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;
2390 IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;
2391 virtual ~IGpuLoadStoreHelperWriter() = default;
2392 virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;
2393 virtual void write(const std::pair<int32_t, std::string>& y) = 0;
2394 virtual void finalize() = 0;
2395protected:
2396 IGpuKernelWriter* _writer;
2397 GpuTensor3dMapper _mapper;
2398 GpuLoadStoreType _type;
2399};
2400
2401class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter
2402{
2403public:
2404 ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
2405 {
2406 }
2407
2408 ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default;
2409 ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;
2410
2411 static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
2412 {
2413 CKW_UNUSED(x, type, dst);
2414
2415 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
2416 {
2417 return false;
2418 }
2419 return true;
2420 }
2421
2422 void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
2423 {
2424 assert(validate(_writer, _mapper, _type, dst));
2425
2426 _dst = dst;
2427 _ls_width_full = dst->format().w;
2428
2429 _coord_x = x->scalar(0, 0).str;
2430 _coord_z = z->scalar(0, 0).str;
2431 _coord_b = b->scalar(0, 0).str;
2432 _coord_orig_z = _coord_z;
2433
2434 out_of_bound_initialize_x(_coord_x);
2435 out_of_bound_initialize_z(_coord_z);
2436
2437 /*
2438 meaning of else:
2439 - x: partial load/store
2440 - y: no load/store operation
2441 - z: no load/store operation
2442 if(x)
2443 {
2444 if(z)
2445 {
2446 if(y)
2447 {
2448 // full load/store width
2449 }
2450 else
2451 {
2452 // no load/store
2453 }
2454 }
2455 else
2456 {
2457 // no load/store
2458 }
2459 }
2460 else
2461 {
2462 if(z)
2463 {
2464 if(y)
2465 {
2466 // partial load/store width
2467 }
2468 else
2469 {
2470 // no load/store
2471 }
2472 }
2473 else
2474 {
2475 // no load/store
2476 }
2477 }
2478 */
2479 }
2480
2481 void write(const std::pair<int32_t, std::string>& y) override
2482 {
2483 int32_t idx_y = y.first;
2484 std::string coord_y = y.second;
2485
2486 // The only check required is on Y.
2487 out_of_bound_initialize_y(coord_y);
2488
2489 const std::string dst = _dst->vector(idx_y).str;
2490 const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
2491 const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address);
2492
2493 _writer->write_text(ls_buf);
2494 _writer->write_text(";\n");
2495
2496 out_of_bound_finalize_y(dst);
2497
        // The leftover load/store will be written in the finalize stage
2499 if(_ls_width_part.size() != 0)
2500 {
2501 int32_t w = 0;
2502 for(auto &p : _ls_width_part)
2503 {
2504 const std::string dst0 = _dst->vector(w, p, idx_y).str;
2505 const std::string coord_x = _coord_x + " + " + std::to_string(w);
2506 const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
2507 const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address);
2508 _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0));
2509
2510 w += p;
2511 }
2512 }
2513 }
2514
2515 void finalize() override
2516 {
2517 out_of_bound_finalize_z();
2518 out_of_bound_finalize_x();
2519 }
2520private:
2521 IVectorTile* _dst { nullptr };
2522 int32_t _ls_width_full { 0 };
2523 std::vector<int32_t> _ls_width_part { };
2524 std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x {};
2525 std::string _coord_x {};
2526 std::string _coord_z {};
2527 std::string _coord_orig_z {};
2528 std::string _coord_b {};
2529
2530 void out_of_bound_initialize_x(std::string& coord)
2531 {
2532 if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
2533 {
2534 auto tensor_format = _mapper.tensor_argument()->format();
2535 auto shape = tensor_format.shape;
2536
2537 _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
2538 if(_ls_width_part.size() != 0)
2539 {
2540 _writer->write_text("if(" + coord + " > 0)\n");
2541 _writer->compound_statement_begin();
2542 }
2543 }
2544 };
2545
2546 void out_of_bound_finalize_x()
2547 {
2548 if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
2549 {
2550 if(_ls_width_part.size() != 0)
2551 {
2552 _writer->compound_statement_end();
2553 _writer->write_text("else\n");
2554 _writer->compound_statement_begin();
2555
2556 out_of_bound_initialize_z(_coord_orig_z);
2557 for(auto &i : _leftovers_x)
2558 {
2559 out_of_bound_initialize_y(i.first.second);
2560 _writer->write_text(i.second);
2561 _writer->write_text(";\n");
2562 out_of_bound_finalize_y(i.first.first);
2563 }
2564 out_of_bound_finalize_z();
2565 _writer->compound_statement_end();
2566 }
2567 }
2568 };
2569
2570 void out_of_bound_initialize_y(std::string& coord)
2571 {
2572 std::string max = "";
2573
2574 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2575
2576 switch(address_mode_y)
2577 {
2578 case TensorSamplerAddressModeY::Skip:
2579 case TensorSamplerAddressModeY::ClampToBorder:
2580 // NOTE: This line should not be moved outside of the switch statement.
                // Querying the component marks it as used and adds it to the list of kernel arguments.
                // Since not every address mode requires this component, it must be requested only in the
                // cases that actually use it.
2584 max = _mapper.tensor_component_y();
2585 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2586 _writer->compound_statement_begin();
2587 break;
2588 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2589 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2590 _writer->write_text("if(" + coord + " >= 0)\n");
2591 _writer->compound_statement_begin();
2592 break;
2593 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2594 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2595 max = _mapper.tensor_component_y();
2596 _writer->write_text("if(" + coord + " < " + max + ")\n");
2597 _writer->compound_statement_begin();
2598 break;
2599 case TensorSamplerAddressModeY::ClampToNearest:
2600 max = _mapper.tensor_component_y();
2601 coord = "clamp(" + coord + ", 0, " + max + " - 1)";
2602 break;
2603 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
2604 max = _mapper.tensor_component_y();
2605 coord = "min(" + coord + ", " + max + " - 1)";
2606 break;
2607 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
2608 coord = "max(" + coord + ", 0)";
2609 break;
2610 case TensorSamplerAddressModeY::None:
2611 break;
2612 default:
                std::cout << "Unsupported address mode for out_of_bound_initialize_y" << std::endl;
2614 assert(false);
2615 }
2616 };
2617
2618 void out_of_bound_finalize_y(const std::string& dst)
2619 {
2620 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2621
2622 switch(address_mode_y)
2623 {
2624 case TensorSamplerAddressModeY::ClampToBorder:
2625 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2626 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2627 case TensorSamplerAddressModeY::Skip:
2628 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2629 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2630 _writer->compound_statement_end();
2631 break;
2632
2633 default:
2634 assert(false);
2635 }
2636
2637 switch(address_mode_y)
2638 {
2639 case TensorSamplerAddressModeY::ClampToBorder:
2640 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2641 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2642 _writer->write_text("else\n");
2643 _writer->compound_statement_begin();
2644 _writer->write_text(dst);
2645 _writer->write_text(" = 0.0f;\n");
2646 _writer->compound_statement_end();
2647 break;
2648
2649 default:
2650 assert(false);
2651 }
2652 };
2653
2654 void out_of_bound_initialize_z(std::string& coord)
2655 {
2656 std::string max = "";
2657
2658 const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
2659
2660 switch(address_mode_z)
2661 {
2662 case TensorSamplerAddressModeZ::Skip:
2663 max = _mapper.tensor_component_z();
2664 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2665 _writer->compound_statement_begin();
2666 break;
2667 case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
2668 _writer->write_text("if(" + coord + " >= 0)\n");
2669 _writer->compound_statement_begin();
2670 break;
2671 case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
2672 max = _mapper.tensor_component_z();
2673 _writer->write_text("if(" + coord + " < " + max + ")\n");
2674 _writer->compound_statement_begin();
2675 break;
2676 case TensorSamplerAddressModeZ::ClampToNearest:
2677 max = _mapper.tensor_component_z();
2678 coord = "clamp(" + coord + ", 0, " + max + " - 1)";
2679 break;
2680 case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
2681 max = _mapper.tensor_component_z();
2682 coord = "min(" + coord + ", " + max + " - 1)";
2683 break;
2684 case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
2685 coord = "max(" + coord + ", 0)";
2686 break;
2687 case TensorSamplerAddressModeZ::None:
2688 break;
2689 default:
                std::cout << "Unsupported address mode for out_of_bound_initialize_z" << std::endl;
2691 assert(false);
2692 }
2693 };
2694
2695 void out_of_bound_finalize_z()
2696 {
2697 const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
2698
        switch(address_mode_z)
        {
            case TensorSamplerAddressModeZ::Skip:
            case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
            case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
                // Close the compound statement opened in out_of_bound_initialize_z()
                _writer->compound_statement_end();
                break;
            case TensorSamplerAddressModeZ::ClampToNearest:
            case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
            case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
            case TensorSamplerAddressModeZ::None:
                // These modes do not open a compound statement, so there is nothing to close
                break;

            default:
                assert(false);
        }
2710 };
2711
2712 std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const
2713 {
2714 std::vector<int32_t> x;
2715
2716 switch(ls_leftover_vector_width)
2717 {
2718 case 0:
2719 break;
2720 case 1:
2721 case 2:
2722 case 3:
2723 case 4:
2724 case 8:
2725 case 16:
2726 x.push_back(ls_leftover_vector_width);
2727 break;
2728 case 5:
2729 x.push_back(4);
2730 x.push_back(1);
2731 break;
2732 case 6:
2733 x.push_back(4);
2734 x.push_back(2);
2735 break;
2736 case 7:
2737 x.push_back(4);
2738 x.push_back(3);
2739 break;
2740 case 9:
2741 x.push_back(8);
2742 x.push_back(1);
2743 break;
2744 case 10:
2745 x.push_back(8);
2746 x.push_back(2);
2747 break;
2748 case 11:
2749 x.push_back(8);
2750 x.push_back(3);
2751 break;
2752 case 12:
2753 x.push_back(8);
2754 x.push_back(4);
2755 break;
2756 case 13:
2757 x.push_back(8);
2758 x.push_back(4);
2759 x.push_back(1);
2760 break;
2761 case 14:
2762 x.push_back(8);
2763 x.push_back(4);
2764 x.push_back(2);
2765 break;
2766 case 15:
2767 x.push_back(8);
2768 x.push_back(4);
2769 x.push_back(3);
2770 break;
2771
2772 default:
2773 assert(false);
2774 }
2775 return x;
2776 }
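    // Worked examples of the decomposition above:
    //   leftover width  7 -> {4, 3}   (emitted as a vload4/vstore4 followed by a vload3/vstore3)
    //   leftover width 13 -> {8, 4, 1}
    //   leftover width  0 -> {}       (the tile width divides the tensor width, no partial access needed)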
2777
2778 std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& address)
2779 {
2780 switch(type)
2781 {
2782 case GpuLoadStoreType::Load:
2783 if(vector_width != 1)
2784 {
2785 return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
2786 }
2787 else
2788 {
2789 return data + " = *(" + address + ")";
2790 }
2791 break;
2792 case GpuLoadStoreType::Store:
2793 if(vector_width != 1)
2794 {
2795 return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
2796 }
2797 else
2798 {
2799 return "*(" + address + ") = " + data;
2800 }
2801 break;
2802 default:
2803 std::cout << "Unsupported GpuLoadStoreType" << std::endl;
2804 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002805 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002806 }
2807 }
2808
2809 std::string to_ls_buffer_address(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
2810 {
2811 auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
2812 assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
2813 const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage);
2814 const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
2815
2816 std::string address;
2817 address += "(__global ";
2818 address += dst_type;
2819 address += "*)(";
2820 address += ptr_buf;
2821 if(x != "0" && (_mapper.is_one_component_x() != true))
2822 {
2823 address += " + (";
2824 address += x + ") * sizeof(" + dst_type + ")";
2825 }
2826 if(y != "0" && (_mapper.is_one_component_y() != true))
2827 {
2828 const std::string stride_y = _mapper.tensor_component_stride_y();
2829 address += " + (";
2830 address += y + ")";
2831 address += " * ";
2832 address += stride_y;
2833 }
2834 if(z != "0" && (_mapper.is_one_component_z() != true))
2835 {
2836 const std::string stride_z = _mapper.tensor_component_stride_z();
2837 address += " + (";
2838 address += z + ")";
2839 address += " * ";
2840 address += stride_z;
2841 }
2842 if(b != "0" && (_mapper.is_one_component_batch() != true))
2843 {
2844 const std::string stride_b = _mapper.tensor_component_stride_batch();
2845 address += " + (";
2846 address += b + ")";
2847 address += " * ";
2848 address += stride_b;
2849 }
2850 address += ")";
2851 return address;
2852 }
2853};
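// Example of the OpenCL C fragments emitted by ClLoadStoreBufferHelperWriter (illustrative only; the
// pointer, stride and tile names depend on the registered tensor argument and destination tile):
//
//   dst_0 = vload4(0, (__global float*)(src_ptr + (x) * sizeof(float) + (y_0) * src_stride_y));
//   vstore4(out_0, 0, (__global float*)(dst_ptr + (x) * sizeof(float) + (y_0) * dst_stride_y));
//
// Coordinates equal to "0", and dimensions known to be 1, are folded away by to_ls_buffer_address().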
2854
2855class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
2856{
2857public:
2858 static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type, IVectorTile *dst)
2859 {
2860 CKW_UNUSED(x);
2861
2862 if(dst->format().w != 4)
2863 {
2864 return false;
2865 }
2866 if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
2867 {
2868 return false;
2869 }
2870 if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
2871 {
2872 return false;
2873 }
2874 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
2875 {
2876 return false;
2877 }
2878 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store)
2879 {
2880 return false;
2881 }
2882 if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
2883 {
2884 return false;
2885 }
2886 return true;
2887 /*
2888 - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4
2889 - z: Only GpuSamplerAddressModeZ::None is supported
2890 */
2891 }
2892 ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
2893 {
2894 }
2895
2896 ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default;
2897 ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default;
2898
2899 void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
2900 {
2901 assert(validate(_writer, _mapper, _type, dst));
2902
2903 _dst = dst;
2904 _ls_width_full = dst->format().w;
2905 _coord_x = x->scalar(0, 0).str;
2906 _coord_z = z->scalar(0, 0).str;
2907 _coord_b = b->scalar(0, 0).str;
2908
2909 /*
2910 if(y)
2911 {
2912 // full load/store width
2913 }
2914 else
2915 {
2916 // no load/store
2917 }
2918 */
2919 }
2920
2921 void write(const std::pair<int32_t, std::string>& y) override
2922 {
2923 int32_t idx_y = y.first;
2924 std::string coord_y = y.second;
2925
2926 // The only check required is on Y.
2927 out_of_bound_initialize_y(coord_y);
2928
2929 const std::string dst = _dst->vector(idx_y).str;
2930 const std::string sampler = to_ls_image2d_sampler();
2931 const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b);
2932 const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord);
2933
2934 _writer->write_text(ls_buf);
2935 _writer->write_text(";\n");
2936
2937 out_of_bound_finalize_y(dst);
2938 }
2939
2940 void finalize() override
2941 {
2942 }
2943private:
2944 IVectorTile* _dst { nullptr };
2945 int32_t _ls_width_full { 0 };
2946 std::string _coord_x {};
2947 std::string _coord_z {};
2948 std::string _coord_b {};
2949
2950 void out_of_bound_initialize_y(std::string& coord)
2951 {
2952 std::string max = "";
2953
2954 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2955
2956 switch(address_mode_y)
2957 {
2958 case TensorSamplerAddressModeY::Skip:
2959 max = _mapper.tensor_component_y();
2960 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2961 _writer->compound_statement_begin();
2962 break;
2963 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2964 _writer->write_text("if(" + coord + " >= 0)\n");
2965 _writer->compound_statement_begin();
2966 break;
2967 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2968 max = _mapper.tensor_component_y();
2969 _writer->write_text("if(" + coord + " < " + max + ")\n");
2970 _writer->compound_statement_begin();
2971 break;
2972 case TensorSamplerAddressModeY::ClampToBorder:
2973 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2974 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2975 case TensorSamplerAddressModeY::ClampToNearest:
2976 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
2977 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
2978 case TensorSamplerAddressModeY::None:
2979 break;
2980 default:
                std::cout << "Unsupported address mode for out_of_bound_initialize_y" << std::endl;
2982 assert(false);
2983 }
2984 };
2985
2986 void out_of_bound_finalize_y(const std::string& dst)
2987 {
2988 CKW_UNUSED(dst);
2989
2990 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2991
        switch(address_mode_y)
        {
            case TensorSamplerAddressModeY::Skip:
            case TensorSamplerAddressModeY::SkipMinEdgeOnly:
            case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
                // Close the compound statement opened in out_of_bound_initialize_y()
                _writer->compound_statement_end();
                break;
            case TensorSamplerAddressModeY::ClampToBorder:
            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
            case TensorSamplerAddressModeY::ClampToNearest:
            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
            case TensorSamplerAddressModeY::None:
                // Border handling for these modes is delegated to the image sampler, so there is nothing to close
                break;

            default:
                assert(false);
        }
3003 };
3004
3005 std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& sampler, const std::string& coord)
3006 {
3007 CKW_UNUSED(vector_width);
3008
3009 auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
3010 const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage);
3011 // const DataType dt = _dst->format().dt;
3012 const std::string post_fix = _dst->format().dt == DataType::Fp32? "f" : "h";
3013
3014 switch(type)
3015 {
3016 case GpuLoadStoreType::Load:
3017 return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
3018 break;
3019 case GpuLoadStoreType::Store:
3020 return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
3021 default:
                std::cout << "Unsupported GpuLoadStoreType" << std::endl;
                assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003025 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003026 }
3027 }
3028
3029 std::string to_ls_image2d_sampler() const
3030 {
3031 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
3032
3033 switch(address_mode_y)
3034 {
3035 case TensorSamplerAddressModeY::None:
3036 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
3037 case TensorSamplerAddressModeY::Skip:
3038 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
3039 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
3040 case TensorSamplerAddressModeY::ClampToBorder:
3041 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
3042 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
3043 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
3044 case TensorSamplerAddressModeY::ClampToNearest:
3045 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
3046 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
3047 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
3048 default:
3049 std::cout << "Unsupported address_mode_coord" << std::endl;
3050 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003051 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003052 }
3053 }
3054
3055 std::string to_ls_image2d_coord(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
3056 {
3057 std::string coord_x = "(" + x + ") >> 2";
3058 std::string coord_y = "(";
3059
3060 if(y != "0" && (_mapper.is_one_component_y() != true))
3061 {
3062 coord_y += y;
3063 }
3064 if(z != "0" && (_mapper.is_one_component_z() != true))
3065 {
3066 const std::string dim = _mapper.tensor_component_y();
3067 coord_y += " + (";
3068 coord_y += z + ")";
3069 coord_y += " * ";
3070 coord_y += dim;
3071 }
3072 if(b != "0" && (_mapper.is_one_component_batch() != true))
3073 {
3074 const std::string dim0 = _mapper.tensor_component_y();
3075 const std::string dim1 = _mapper.tensor_component_z();
3076 coord_y += " + (";
3077 coord_y += b + ")";
3078 coord_y += " * ";
3079 coord_y += dim0;
3080 coord_y += " * ";
3081 coord_y += dim1;
3082 }
3083 coord_y += ")";
3084 return "(int2)(" + coord_x + ", " + coord_y + ")";
3085 }
3086};
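// Example of the coordinate packing performed by to_ls_image2d_coord() (illustrative): for a C_W_H sampler,
// the element at (x, y, z, b) is accessed at pixel
//
//   (int2)((x) >> 2, (y + z * dimY + b * dimY * dimZ))
//
// where dimY and dimZ are the mapper's y and z dimensions. The x pixel coordinate addresses four channels at
// a time, which is why validate() requires the tile width to be 4.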
3087
3088/** IGpuLoadStoreHelperWriter factory class */
3089class ClLoadStoreHelperWriterFactory final
3090{
3091public:
    /** Static method to create the IGpuLoadStoreHelperWriter according to the tensor storage set in the mapper
    *
    * @param[in] x      Gpu kernel writer used to emit the code
    * @param[in] mapper Mapper that reduces the tensor to a 3d tensor
    * @param[in] type   Whether a load or a store helper is required
    *
    * @return IGpuLoadStoreHelperWriter
3096 */
3097 static std::unique_ptr<IGpuLoadStoreHelperWriter> create(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type)
3098 {
3099 const auto tensor_storage = mapper.gpu_sampler().storage;
3100 switch(tensor_storage)
3101 {
3102 case GpuSamplerTensorStorage::BufferUint8Ptr:
3103 return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
3104 case GpuSamplerTensorStorage::Image2dReadOnly:
3105 case GpuSamplerTensorStorage::Image2dWriteOnly:
3106 return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
3107 default:
3108 std::cout << "Unsupported Gpu tensor storage" << std::endl;
3109 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003110 return nullptr;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003111 }
3112 }
3113};
3114
3115// This utility method needs to go in utils.h
3116inline bool is_tile_scalar(IVectorTile* x)
3117{
3118 return x->format().w == 1 && x->format().h == 1;
3119}
3120
3121class ClKernelWriter : public IGpuKernelWriter
3122{
3123public:
3124 ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
3125 {
3126 _data = x;
3127 _attr = attr;
3128 }
3129
3130 ClKernelWriter(const ClKernelWriter &) = default;
3131 ClKernelWriter &operator=(const ClKernelWriter &) = default;
3132
    // An IdSpace ID identifies a fragment registered in ICode to ensure
    // there are no naming conflicts or ambiguity in the generated code
3135 void set_IdSpace(int32_t id) override
3136 {
3137 _data->tiles.set_IdSpace(id);
3138 _data->arguments.set_IdSpace(id);
3139 }
3140
3141 void import_tile(const std::string& dst_name, const IVectorTile *src) override
3142 {
3143 _data->tiles.insert(dst_name, src);
3144 }
3145
3146 void declare_argument(const std::string& name, const TensorInfo& tensor) override
3147 {
3148 assert(_data->arguments[name] == nullptr);
3149 _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
3150 }
3151
3152 void declare_tile(const std::string& name, const TileInfo& format) override
3153 {
3154 assert(_data->tiles[name] == nullptr);
3155 _data->tiles.insert(name, format);
3156
3157 IVectorTile *x = _data->tiles[name];
3158
3159 for(auto &t : x->underlying_source_variables())
3160 {
3161 _data->code += t.type.str + " " + t.str + ";\n";
3162 }
3163 }
3164
3165 void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) override
3166 {
3167 assert(_data->tiles[name] == nullptr);
3168 _data->tiles.insert(name, in, dt);
3169 // Note: A constant does not need to be declared in the code
3170 }
3171
3172 void write_text(const std::string& x) override
3173 {
3174 _data->code += x;
3175 }
3176
3177 void compound_statement_begin() override
3178 {
3179 _data->tiles.increment_registry_level();
3180 _data->code += "{\n";
3181 }
3182
3183 void compound_statement_end() override
3184 {
3185 _data->tiles.decrement_registry_level();
3186 _data->code += "}\n";
3187 }
3188
3189 void op_get_global_id(const Operand& dst_var, int32_t dim) override
3190 {
3191 assert(dst_var.type() == OperandType::Tile);
3192 assert(_data->tiles.has_tile(dst_var.value()));
3193 assert(_data->tiles[dst_var.value()]->format().w == 1 &&
3194 _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
3195
3196 auto var = _data->tiles[dst_var.value()];
3197
3198 _data->code += var->scalar(0, 0).str;
3199 _data->code += " = get_global_id(";
3200 _data->code += std::to_string(dim);
3201 _data->code += ");\n";
3202 };
3203
3204 void op_get_global_coord(const Operand& o_dst, const Operand& o_step, const TensorOperand& o_tensor, int32_t dim) override
3205 {
3206 OperandUnpacker operands(_data->tiles, _data->arguments);
3207 auto dst = operands.unpack(o_dst);
3208 auto step = operands.unpack(o_step);
3209
3210 // Validation: Check that x, y and z are scalar
3211
3212 TensorOperandUnpacker tensor_operands(_data->arguments);
3213 auto tensor = tensor_operands.unpack(o_tensor);
3214 auto gpu_sampler = o_tensor.sampler();
3215
3216 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3217
3218 switch (dim)
3219 {
3220 case 0:
3221 if(mapper.is_one_component_x())
3222 {
3223 _data->code += dst->scalar(0, 0).str;
3224 _data->code += " = 0;\n";
3225 }
3226 else
3227 {
3228 if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
3229 {
3230 // Validation: Check: fixed tensor shape
3231 // TO BE CHANGED
3232 _data->code += dst->scalar(0, 0).str;
3233 _data->code += " = get_global_id(0) * ";
3234 _data->code += step->scalar(0, 0).str;
3235 _data->code += ";\n";
3236 }
3237 else
3238 {
3239 _data->code += dst->scalar(0, 0).str;
3240 _data->code += " = get_global_id(0) * ";
3241 _data->code += step->scalar(0, 0).str;
3242 _data->code += ";\n";
3243 }
3244 }
3245 break;
3246 case 1:
3247 if(mapper.is_one_component_y())
3248 {
3249 _data->code += dst->scalar(0, 0).str;
3250 _data->code += " = 0;\n";
3251 }
3252 else
3253 {
3254 if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
3255 {
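                    // Note: no code is emitted for OverlappingMin on the Y dimension in this prototype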
3256
3257 }
3258 else
3259 {
3260 _data->code += dst->scalar(0, 0).str;
3261 _data->code += " = get_global_id(1) * ";
3262 _data->code += step->scalar(0, 0).str;
3263 _data->code += ";\n";
3264 }
3265 }
3266 break;
3267 case 2:
3268 if(mapper.is_one_component_z())
3269 {
3270 _data->code += dst->scalar(0, 0).str;
3271 _data->code += " = 0;\n";
3272 }
3273 else
3274 {
3275 _data->code += dst->scalar(0, 0).str;
3276 _data->code += " = get_global_id(2) * ";
3277 _data->code += step->scalar(0, 0).str;
3278 _data->code += ";\n";
3279 }
3280 break;
3281 default:
3282 break;
3283 }
3284 };
3285
3286 void op_get_global_batch(const Operand& o_dst, const TensorOperand& o_tensor) override
3287 {
3288 OperandUnpacker operands(_data->tiles, _data->arguments);
3289 auto dst = operands.unpack(o_dst);
3290
3291 TensorOperandUnpacker tensor_operands(_data->arguments);
3292 auto tensor = tensor_operands.unpack(o_tensor);
3293 auto gpu_sampler = o_tensor.sampler();
3294
3295 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3296
3297 if(mapper.is_one_component_batch())
3298 {
3299 _data->code += dst->scalar(0, 0).str;
3300 _data->code += " = 0;\n";
3301 }
3302 else
3303 {
3304 std::cout << "Unsupported batched computation" << std::endl;
3305 assert(false);
3306 }
3307 };
3308
3309 void op_get_global_size(const Operand& dst_var, int32_t dim) override
3310 {
3311 assert(dst_var.type() == OperandType::Tile);
3312 assert(_data->tiles.has_tile(dst_var.value()));
3313 assert(_data->tiles[dst_var.value()]->format().w == 1 &&
3314 _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
3315
3316 auto var = _data->tiles[dst_var.value()];
3317
3318 _data->code += var->scalar(0, 0).str;
3319 _data->code += " = get_global_size(";
3320 _data->code += std::to_string(dim);
3321 _data->code += ");\n";
3322 }
3323
3324 void op_binary_expression(const Operand& dst_name, const Operand& lhs_name, BinaryOp op, const Operand& rhs_name) override
3325 {
3326 OperandUnpacker operands(_data->tiles, _data->arguments);
3327 auto lhs = operands.unpack(lhs_name);
3328 auto rhs = operands.unpack(rhs_name);
3329 auto dst = operands.unpack(dst_name);
3330
3331 const int32_t dst_w = dst->format().w;
3332 const int32_t dst_h = dst->format().h;
3333 assert(lhs != nullptr);
3334 const int32_t lhs_w = lhs->format().w;
3335 const int32_t rhs_w = rhs->format().w;
3336
3337 if(op == BinaryOp::MatMul_Nt_T)
3338 {
3339 assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
3340 for(int32_t y = 0; y < dst_h; ++y)
3341 {
3342 for(int32_t x = 0; x < dst_w; ++x)
3343 {
3344 for(int32_t k = 0; k < lhs_w; ++k)
3345 {
3346 _data->code += dst->scalar(x, y).str;
3347 _data->code += " = fma(";
3348 _data->code += lhs->scalar(k, y).str;
3349 _data->code += ", ";
3350 _data->code += rhs->scalar(k, x).str;
3351 _data->code += ", ";
3352 _data->code += dst->scalar(x, y).str;
3353 _data->code += ");\n";
3354 }
3355 }
3356 }
3357
3358 return;
3359 }
3360
3361 bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
3362 bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;
3363
3364 std::string lhs_prefix = broadcast_lhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
3365 std::string rhs_prefix = broadcast_rhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
3366 std::string op_str = to_string(op);
3367
3368 // Broadcasting on Y is automatic
3369 for(int32_t y = 0; y < dst_h; ++y)
3370 {
3371 _data->code += dst->vector(y).str;
3372 _data->code += " = ";
3373 _data->code += lhs_prefix + lhs->vector(y).str;
3374 _data->code += " ";
3375 _data->code += op_str;
3376 _data->code += " ";
3377 _data->code += rhs_prefix + rhs->vector(y).str;
3378 _data->code += ";\n";
3379 }
3380 };
3381
3382 void op_cast_expression(const Operand& o_dst, const Operand &o_src, ConvertPolicy policy) override
3383 {
3384 CKW_UNUSED(policy);
3385
3386 OperandUnpacker operands(_data->tiles, _data->arguments);
3387 auto src = operands.unpack(o_src);
3388 auto dst = operands.unpack(o_dst);
3389
3390 // const int32_t dst_w = dst->format().w;
3391 const int32_t dst_h = dst->format().h;
3392 const std::string dt = dst->scalar(0, 0).type.str;
3393
3394 // Broadcasting on Y is automatic
3395 for(int32_t y = 0; y < dst_h; ++y)
3396 {
3397 _data->code += dst->vector(y).str;
3398 _data->code += " = convert_" + dt + "(";
3399 _data->code += src->vector(y).str;
3400 _data->code += ");\n";
3401 }
3402 };
3403
3404 void op_assign(const Operand& dst_name, const Operand& src_name) override
3405 {
3406 OperandUnpacker operands(_data->tiles, _data->arguments);
3407 auto src = operands.unpack(src_name);
3408 auto dst = operands.unpack(dst_name);
3409
3410 const int32_t dst_w = dst->format().w;
3411 const int32_t dst_h = dst->format().h;
3412 const int32_t src_w = src->format().w;
3413 // const int32_t src_h = src->format().h;
3414 const std::string dt = dst->scalar(0, 0).type.str;
3415
3416 bool broadcast_src_x = dst_w != 1 && src_w == 1;
3417
3418 std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";
3419
3420 // Broadcasting on Y is automatic
3421 for(int32_t y = 0; y < dst_h; ++y)
3422 {
3423 _data->code += dst->vector(y).str;
3424 _data->code += " = ";
3425 _data->code += src_prefix + src->vector(y).str;
3426 _data->code += ";\n";
3427 }
3428 }
3429
3430 void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) override
3431 {
3432 OperandUnpacker operands(_data->tiles, _data->arguments);
3433 auto src = operands.unpack(src_name);
3434 auto dst = operands.unpack(dst_name);
3435
3436 const int32_t dst_w = dst->format().w;
3437 const int32_t dst_h = dst->format().h;
3438 const int32_t src_w = src->format().w;
3439 // const int32_t src_h = src->format().h;
3440 const std::string dt = dst->scalar(0, 0).type.str;
3441
3442 bool broadcast_src_x = dst_w != 1 && src_w == 1;
3443
3444 std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";
3445
3446 // Broadcasting on Y is automatic
3447 for(int32_t y = 0; y < dst_h; ++y)
3448 {
3449 _data->code += dst->vector(y).str;
3450 _data->code += " = ";
3451
3452 switch(func)
3453 {
3454 case ScalarUnaryFunction::Exp:
3455 _data->code += "exp(";
3456 break;
3457
3458 default:
3459 CKW_ASSERT(false);
3460 }
3461
3462 _data->code += src_prefix + src->vector(y).str;
3463 _data->code += ");\n";
3464 }
3465 }
3466
3467 void op_if(const Operand& o_lhs, BinaryOp op, const Operand& o_rhs) override
3468 {
3469 OperandUnpacker operands(_data->tiles, _data->arguments);
3470 auto lhs = operands.unpack(o_lhs);
3471 auto rhs = operands.unpack(o_rhs);
3472
3473 assert(is_tile_scalar(lhs));
3474 assert(is_tile_scalar(rhs));
3475
3476 _data->code += "if(";
3477 _data->code += lhs->scalar(0, 0).str;
3478 _data->code += " ";
3479 _data->code += to_string(op);
3480 _data->code += " ";
3481 _data->code += rhs->scalar(0, 0).str;
3482 _data->code += ")\n";
3483 }
3484
3485 void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, AssignmentOp update_op, const Operand& update_value_name) override
3486 {
3487 OperandUnpacker operands(_data->tiles, _data->arguments);
3488 auto var = operands.unpack(var_name);
3489 auto cond_value = operands.unpack(cond_value_name);
3490 auto update_value = operands.unpack(update_value_name);
3491
3492 const int32_t dst_w = var->format().w;
3493 const int32_t dst_h = var->format().h;
3494
3495 // It must be a scalar variable
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003496 CKW_UNUSED(dst_w, dst_h);
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003497 assert(dst_w == 1);
3498 assert(dst_h == 1);
3499
3500 _data->code += "for(; " ;
3501 _data->code += var->scalar(0, 0).str;
3502 _data->code += " ";
3503 _data->code += to_string(cond_op);
3504 _data->code += " " + cond_value->scalar(0, 0).str + "; ";
3505 _data->code += var->scalar(0, 0).str;
3506 _data->code += " ";
3507 _data->code += to_string(update_op);
3508 _data->code += " " + update_value->scalar(0, 0).str + ")";
3509 _data->code += "\n";
3510 }
3511
3512 void op_load_immediate(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_y, const Operand& o_z, const Operand& o_batch_idx, const Operand& dilation_y) override
3513 {
3514 OperandUnpacker operands(_data->tiles, _data->arguments);
3515 auto dst = operands.unpack(o_dst);
3516 auto x = operands.unpack(o_x);
3517 auto y = operands.unpack(o_y);
3518 auto z = operands.unpack(o_z);
3519 auto dil_y = operands.unpack(dilation_y);
3520 auto b = operands.unpack(o_batch_idx);
3521
3522 TensorOperandUnpacker tensor_operands(_data->arguments);
3523 auto tensor = tensor_operands.unpack(o_tensor);
3524 auto gpu_sampler = o_tensor.sampler();
3525
3526 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3527
3528 auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
3529
3530 // Initialize the constant part
3531 load_writer->initialize(dst, x, z, b);
3532
3533 for(int i = 0; i < dst->format().h; ++i)
3534 {
3535 std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
3536 if(dil_y->scalar(0, 0).str != "1")
3537 {
3538 coord_y += " * " + dil_y->scalar(0, 0).str;
3539 }
3540 load_writer->write(std::make_pair(i, coord_y));
3541 }
3542
3543 load_writer->finalize();
3544 }
3545
3546 void op_load_indirect(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_indirect_h, const Operand& o_z, const Operand& o_batch_idx) override
3547 {
3548 OperandUnpacker operands(_data->tiles, _data->arguments);
3549 auto dst = operands.unpack(o_dst);
3550 auto x = operands.unpack(o_x);
3551 auto y_ind = operands.unpack(o_indirect_h);
3552 auto z = operands.unpack(o_z);
3553 auto b = operands.unpack(o_batch_idx);
3554
3555 TensorOperandUnpacker tensor_operands(_data->arguments);
3556 auto tensor = tensor_operands.unpack(o_tensor);
3557 auto gpu_sampler = o_tensor.sampler();
3558
3559 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3560
3561 auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
3562
3563 // Initialize the constant part
3564 load_writer->initialize(dst, x, z, b);
3565
3566 for(int i = 0; i < dst->format().h; ++i)
3567 {
3568 load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
3569 }
3570
3571 load_writer->finalize();
3572 }
3573
3574 void op_store_immediate(const TensorOperand& tensor_name, const Operand& src_name, const Operand& x_name, const Operand& y_name, const Operand& z_name, const Operand& batch_index_name) override
3575 {
3576 OperandUnpacker operands(_data->tiles, _data->arguments);
3577 auto src = operands.unpack(src_name);
3578 auto x = operands.unpack(x_name);
3579 auto y = operands.unpack(y_name);
3580 auto z = operands.unpack(z_name);
3581 auto b = operands.unpack(batch_index_name);
3582
3583 TensorOperandUnpacker tensor_operands(_data->arguments);
3584 auto tensor = tensor_operands.unpack(tensor_name);
3585 auto gpu_sampler = tensor_name.sampler();
3586
3587 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3588
3589 auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);
3590
3591 // Initialize the constant part
3592 store_writer->initialize(src, x, z, b);
3593
3594 int32_t tile_h = src->format().h;
3595
3596 for(int m0 = tile_h - 1; m0 >= 0; m0--)
3597 {
3598 store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
3599 }
3600
3601 store_writer->finalize();
3602 }
3603
3604 void op_return() override
3605 {
3606 _data->code += "return;\n";
3607 }
3608
3609 void util_get_indirect_buffer(const Operand& o_dst, const TensorOperand& o_tensor, const Operand& o_x, const Operand& o_y, const Operand& o_x_off, const Operand& o_y_off) override
3610 {
3611 OperandUnpacker operands(_data->tiles, _data->arguments);
3612 auto dst = operands.unpack(o_dst);
3613 auto x = operands.unpack(o_x);
3614 auto y = operands.unpack(o_y);
3615 auto x_off = operands.unpack(o_x_off);
3616 auto y_off = operands.unpack(o_y_off);
3617
3618 TensorOperandUnpacker tensor_operands(_data->arguments);
3619 auto tensor = tensor_operands.unpack(o_tensor);
3620
3621 assert(dst->format().w == 1);
3622 assert(x->format().w == 1);
3623 assert(y->format().w == 1);
3624 assert(x_off->format().w == 1);
3625 assert(y_off->format().w == 1);
3626 assert(dst->format().dt == DataType::Int32);
3627 assert(x->format().dt == DataType::Int32);
3628 assert(y->format().dt == DataType::Int32);
3629 assert(x_off->format().dt == DataType::Int32);
3630 assert(y_off->format().dt == DataType::Int32);
3631
3632 const std::string width = tensor->component(TensorComponent::W);
3633 const std::string height = tensor->component(TensorComponent::H);
3634 const std::string wxh = tensor->component(TensorComponent::WxH);
3635 /*
3636 int x_s;
3637 int y_s;
3638 x_s = (xi_0 + x_k);
3639 y_s = (yi_0 + y_k);
3640 mi_0 = x_s + y_s * width + b * widthxheight;
3641 mi_0 = select(-1, mi_0, x_s >= 0);
3642 mi_0 = select(-1, mi_0, y_s >= 0);
3643 mi_0 = select(-1, mi_0, x_s < 128);
3644 mi_0 = select(-1, mi_0, y_s < 128);
3645 */
3646 compound_statement_begin();
3647 declare_tile("_x_s", TileInfo(DataType::Int32));
3648 declare_tile("_y_s", TileInfo(DataType::Int32));
3649 auto x_s = operands.unpack(Operand("_x_s"));
3650 auto y_s = operands.unpack(Operand("_y_s"));
3651 for(int i = 0; i < dst->format().h; ++i)
3652 {
3653 // x_s = (xi_0 + x_k);
3654 // y_s = (yi_0 + y_k);
3655 _data->code += x_s->scalar(0, i).str;
3656 _data->code += " = (";
3657 _data->code += x->scalar(0, i).str;
3658 _data->code += " + ";
3659 _data->code += x_off->scalar(0, i).str;
3660 _data->code += ");\n";
3661 _data->code += y_s->scalar(0, i).str;
3662 _data->code += " = (";
3663 _data->code += y->scalar(0, i).str;
3664 _data->code += " + ";
3665 _data->code += y_off->scalar(0, i).str;
3666 _data->code += ");\n";
3667 // mi_0 = x_s + y_s * width;
3668 _data->code += dst->scalar(0, i).str;
3669 _data->code += " = ";
3670 _data->code += x_s->scalar(0, i).str;
3671 _data->code += " + ";
3672 _data->code += y_s->scalar(0, i).str;
3673 _data->code += " * " + width + ";\n";
            // mi_0 = select(-1, mi_0, x_s >= 0);
3675 _data->code += dst->scalar(0, i).str;
3676 _data->code += " = select(-1, ";
3677 _data->code += dst->scalar(0, i).str;
3678 _data->code += ", ";
3679 _data->code += x_s->scalar(0, i).str;
3680 _data->code += " >= 0);\n";
            // mi_0 = select(-1, mi_0, y_s >= 0);
3682 _data->code += dst->scalar(0, i).str;
3683 _data->code += " = select(-1, ";
3684 _data->code += dst->scalar(0, i).str;
3685 _data->code += ", ";
3686 _data->code += y_s->scalar(0, i).str;
3687 _data->code += " >= 0);\n";
            // mi_0 = select(-1, mi_0, x_s < width);
3689 _data->code += dst->scalar(0, i).str;
3690 _data->code += " = select(-1, ";
3691 _data->code += dst->scalar(0, i).str;
3692 _data->code += ", ";
3693 _data->code += x_s->scalar(0, i).str;
3694 _data->code += " < ";
3695 _data->code += width + ");\n";
            // mi_0 = select(-1, mi_0, y_s < height);
3697 _data->code += dst->scalar(0, i).str;
3698 _data->code += " = select(-1, ";
3699 _data->code += dst->scalar(0, i).str;
3700 _data->code += ", ";
3701 _data->code += y_s->scalar(0, i).str;
3702 _data->code += " < ";
3703 _data->code += height + ");\n";
3704 }
3705 compound_statement_end();
3706 }
3707
3708private:
3709 GpuKernelWriterDataHolder* _data { nullptr };
3710 GpuKernelWriterAttribute * _attr { nullptr };
3711};
3712
3713/** IGpuKernelWriter factory class */
3714class GpuKernelWriterFactory final
3715{
3716public:
    /** Static method to create the IGpuKernelWriter according to the Gpu programming language
    *
    * @param[in] attr Kernel writer attributes
    * @param[in] x    Kernel writer data holder, which also carries the Gpu programming language to use
    *
    * @return IGpuKernelWriter
3722 */
3723 static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
3724 {
3725 switch(x->programming_language())
3726 {
3727 case GpuTargetLanguage::OpenCL:
3728 return std::make_unique<ClKernelWriter>(attr, x);
3729 default:
3730 std::cout << "Unsupported Gpu programming language" << std::endl;
3731 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003732 return nullptr;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003733 }
3734 }
3735};
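// End-to-end usage sketch (illustrative only; the tensor info, tile and kernel names are hypothetical and
// error handling is omitted):
//
//   GpuKernelWriterAttribute  attr{};
//   GpuKernelWriterDataHolder data(GpuTargetLanguage::OpenCL);
//   auto writer = GpuKernelWriterFactory::create(&attr, &data);
//
//   TensorInfo src_info{}; // shape, data type and id would be filled in here
//   writer->set_IdSpace(1);
//   writer->declare_argument("src", src_info);
//   writer->declare_tile("gid_x", TileInfo(DataType::Int32));
//   writer->op_get_global_id(Operand("gid_x"), 0);
//
//   const std::string code = generate_code(data, "example_kernel");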
3736
3737inline int32_t adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx)
3738{
3739 auto tensor = tensor_info_id->shape;
3740
3741 int32_t dim[3] = {0};
3742
3743 switch(tensor_format)
3744 {
3745 case TensorSamplerFormat::C_W_H:
3746 dim[0] = tensor[0];
3747 dim[1] = tensor[1];
3748 dim[2] = tensor[2];
3749 break;
3750 case TensorSamplerFormat::C_WH_1:
3751 dim[0] = tensor[0];
3752 dim[1] = tensor[1] * tensor[2];
3753 dim[2] = 1;
3754 break;
3755 default:
3756 std::cout << "Unsupported tensor format" << std::endl;
3757 assert(false);
3758 break;
3759 }
3760
3761 return std::min(step, dim[idx]);
3762}
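// Worked example (hypothetical shape {C = 3, W = 7, H = 5, N = 1} held in "info"):
//   adjust_step(TensorSamplerFormat::C_W_H,  4, &info, 0) -> min(4, 3)     = 3
//   adjust_step(TensorSamplerFormat::C_W_H,  4, &info, 1) -> min(4, 7)     = 4
//   adjust_step(TensorSamplerFormat::C_WH_1, 8, &info, 1) -> min(8, 7 * 5) = 8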
3763
3764} // namespace prototype
3765} // namespace ckw
3766
3767#endif // CKW_SRC_PROTOTYPE_H