1/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#ifndef CKW_SRC_PROTOTYPE_H
26#define CKW_SRC_PROTOTYPE_H
27
28#include <vector>
29#include <map>
30#include <string>
31#include <cstdint> // int32_t
32#include <iostream> // cout (to be removed)
33#include <cassert> // assert (to be removed)
34#include <unordered_map>
35#include <chrono>
36#include <cmath>
37#include <memory>
38#include <algorithm>
39#include <array>
40#include <stdexcept>
41
42#include "ckw/Types.h"
43#include "ckw/TensorInfo.h"
44#include "ckw/Error.h"
45
46namespace ckw
47{
48namespace prototype {
49
50// Dummy data structure for Size2D
51using Size2D = std::vector<int32_t>;
52
53// Dummy Status
54using Status = void;
55
56enum class ComponentType : int32_t
57{
58 Complex = 0,
59 Simple = 1,
60 Unfusable = 2
61};
62
63enum class GpuCompilationSpeed
64{
65 Fast = 0x00, // fast compilation may increase the latency of the network
66 Slow = 0x01 // slow compilation may decrease the latency of the network
67};
68
69enum class GpuExtensions
70{
71 Fp16,
72 Dot8,
73 Mmul,
74 FastMath
75};
76
77struct TensorInfo
78{
79 TensorShape shape { {0} };
80 DataType data_type { DataType::Unknown };
81 TensorDataLayout data_layout { TensorDataLayout::Nhwc };
82 int32_t id { -1 };
83};
84
85struct ComponentAttribute
86{
87 GpuCompilationSpeed compilation_speed {GpuCompilationSpeed::Fast};
88 bool overwrite_tile { true };
89};
90
91inline std::string data_type_to_cl_type(DataType dt)
92{
93 switch(dt)
94 {
95 case DataType::Fp32:
96 return "float";
97 case DataType::Fp16:
98 return "half";
99 case DataType::Int8:
100 return "char";
101 case DataType::Uint8:
102 return "uchar";
103 case DataType::Uint16:
104 return "ushort";
105 case DataType::Int16:
106 return "short";
107 case DataType::Uint32:
108 return "uint";
109 case DataType::Int32:
110 return "int";
111 case DataType::Bool:
112 return "bool";
113 default:
114 assert(false);
115 }
116}
117
118inline int32_t width_to_cl_vector_size(int32_t width)
119{
120 switch(width)
121 {
122 case 1:
123 return 1;
124 case 2:
125 return 2;
126 case 3:
127 return 3;
128 case 4:
129 return 4;
130 case 5:
131 case 6:
132 case 7:
133 case 8:
134 return 8;
135 case 9:
136 case 10:
137 case 11:
138 case 12:
139 case 13:
140 case 14:
141 case 15:
142 case 16:
143 return 16;
144 default:
145 assert(false);
146 }
147}
148
149inline std::string get_cl_data_type(DataType dt, int32_t width)
150{
151 std::string data_type;
152 int32_t w = width_to_cl_vector_size(width);
153 data_type += data_type_to_cl_type(dt);
154 if(w != 1)
155 {
156 data_type += std::to_string(w);
157 }
158 return data_type;
159}
160
161inline std::string to_opencl_store(int32_t vector_length)
162{
163 if(vector_length != 1)
164 {
165 return "vstore" + std::to_string(vector_length) + "(";
166 }
167 else
168 {
169 return "*(";
170 }
171}
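// Illustrative input/output pairs for the helpers above (an informal sketch; the values follow
// directly from the switch statements and are not an exhaustive list):
//   get_cl_data_type(DataType::Fp16, 3)  -> "half3"
//   get_cl_data_type(DataType::Fp32, 5)  -> "float8"   (5 is rounded up to the next valid CL vector width)
//   get_cl_data_type(DataType::Int32, 1) -> "int"      (no width suffix for scalars)
//   to_opencl_store(4)                   -> "vstore4("
//   to_opencl_store(1)                   -> "*("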
172
173struct TileInfo
174{
175 TileInfo() {}
176 TileInfo(DataType dt) : dt(dt), w(1), h(1) {}
177 TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) {}
178 TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) {}
179 DataType dt{ DataType::Unknown }; // Data type of the tile
180 int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels)
181 int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions)
182};
183
184inline std::ostream& operator << (std::ostream& o, const TileInfo& a)
185{
186 o << a.w << " x " << a.h;
187 return o;
188}
189
190struct DataTypeAsString
191{
192 std::string str { "" };
193 DataType dt { DataType::Unknown };
194 int32_t size { 1 };
195};
196
197struct ValueAsString
198{
199 std::string str { "" };
200 DataTypeAsString type { };
201};
202
203// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
204// A Tile is a collection of variables used to express 2D data.
205class IScalarTile
206{
207public:
208 virtual ~IScalarTile() = default;
209 /** Method to get the scalar variable from a tile
210 * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
211 * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
212 *
213 * @return the scalar variable as a string
214 */
215 virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;
216 /** Method to get the list of underlying variable names used by the tile
217 *
218 * @return the list of variable names
219 */
220 virtual std::vector<ValueAsString> underlying_source_variables() const = 0;
221 /** Method to get the name of the tile.
222 *
223 * @return the name of the tile
224 */
225 std::string name() const
226 {
227 return _basename;
228 }
229 /** Method to get the tile format
230 *
231 * @return the format
232 */
233 TileInfo format() const
234 {
235 return _format;
236 }
237 /** Method to know whether the tile is assignable or not (constant)
238 *
239 * @return true if the tile is assignable
240 */
241 virtual bool is_assignable() const = 0;
242 /** Method to know whether the tile needs to be declared
243 *
244 * @return true if the tile needs to be declared in the code before being used
245 */
246 virtual bool need_declaration() const = 0;
247protected:
248 TileInfo _format { }; // Tile format
249 std::string _basename { "" }; // Tile name
250};
251
252// A tile is a collection of variables used to express 2D data. The variables are vectors in the GPU context.
253// The vector size is given by the width of the tile, and the height of the tile gives the number of vectors.
254class IVectorTile : public IScalarTile
255{
256public:
257 virtual ~IVectorTile() = default;
258 /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
259 * The user can query the list of supported widths for the vectors through preferred_vector_sizes().
260 *
261 * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
262 *
263 * @return the vector variable as a string
264 */
265 virtual ValueAsString vector(int32_t y) const = 0;
266 /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
267 *
268 * @return the vector variable as a string
269 */
270 virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
271 /** Method to get the preferred vector sizes.
272 *
273 * @return a vector with the preferred vector sizes
274 */
275 //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
276};
277
278class ClTile : public IVectorTile
279{
280public:
281 ClTile(const std::string& name, TileInfo format)
282 {
283 _format = format;
284 _basename = name;
285 }
286
287 ValueAsString scalar(int32_t x, int32_t y) const override
288 {
289 x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
290 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
291
292 ValueAsString t;
293 t.str = build_variable_name(y);
294 t.type.str = get_cl_data_type(_format.dt, 1);
295 t.type.dt = _format.dt;
296 t.type.size = 1;
297
298 // Check required because if the width has only one element, we cannot use .s0
299 if(_format.w != 1)
300 {
301 // Automatic broadcasting
302            t.str += ".s" + to_scalar_hex(x);
303 }
304
305 return t;
306 }
307
308 ValueAsString vector(int32_t y) const override
309 {
310 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
311
312 ValueAsString t;
313 t.str = build_variable_name(y);
314 t.type.str = get_cl_data_type(_format.dt, _format.w);
315 t.type.dt = _format.dt;
316 t.type.size = _format.w;
317 return t;
318 }
319
320 ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
321 {
322 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
323
324 ValueAsString t;
325 t.str = build_variable_name(y);
326 t.type.str = get_cl_data_type(_format.dt, width);
327 t.type.dt = _format.dt;
328 t.type.size = width;
329
330 if(_format.w != 1)
331 {
332 t.str += ".s";
333 for(int i = 0; i < width; ++i)
334 {
335 t.str += to_scalar_hex(x_start + i);
336 }
337 }
338 return t;
339 }
340
341 std::vector<ValueAsString> underlying_source_variables() const override
342 {
343 std::vector<ValueAsString> vars;
344 for(int32_t y = 0; y < _format.h; ++y)
345 {
346 ValueAsString t;
347 t.str = build_variable_name(y);
348 t.type.str = get_cl_data_type(_format.dt, _format.w);
349 t.type.dt = _format.dt;
350 t.type.size = _format.w;
351 vars.push_back(t);
352 }
353 return vars;
354 }
355
356 bool is_assignable() const override
357 {
358 return true;
359 }
360
361 bool need_declaration() const override
362 {
363 return true;
364 }
365
366private:
367 std::string build_variable_name(int32_t y) const
368 {
369 std::string var_name = _basename;
370
371 if(_format.h == 1)
372 {
373 return var_name;
374
375 }
376 else
377 {
378 var_name += "_";
379 var_name += std::to_string(y);
380 }
381
382 return var_name;
383 }
384
385 std::string to_scalar_hex(int32_t x) const
386 {
387 switch(x)
388 {
389 case 0:
390 case 1:
391 case 2:
392 case 3:
393 case 4:
394 case 5:
395 case 6:
396 case 7:
397 case 8:
398 case 9:
399 return std::to_string(x);
400 case 10:
401 return "A";
402 case 11:
403 return "B";
404 case 12:
405 return "C";
406 case 13:
407 return "D";
408 case 14:
409 return "E";
410 case 15:
411 return "F";
412 default:
413 std::cout << "Unsupported hexadecimal value" << std::endl;
414 assert(false);
415 }
416 }
417};
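// A minimal sketch of the variable names produced by ClTile (illustrative only; the tile
// name and TileInfo are arbitrary):
//   ClTile tile("dst", TileInfo(DataType::Fp32, 4, 2));
//   tile.underlying_source_variables()  -> { "dst_0" (float4), "dst_1" (float4) }
//   tile.vector(1).str                  -> "dst_1"
//   tile.scalar(2, 1).str               -> "dst_1.s2"
//   tile.vector(0, 2, 1).str            -> "dst_1.s01"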
418
419// Unlike ClTile, this tile holds its values directly as strings. The name of this class is somewhat misleading, since the stored values can change over time.
420class ClConstantTile : public IVectorTile
421{
422public:
423 ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt)
424 {
425 _format.w = in[0].size();
426 _format.h = in.size();
427 _format.dt = dt;
428
429 _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));
430
431 for(int32_t y = 0; y < _format.h; ++y)
432 {
433 for(int32_t x = 0; x < _format.w; ++x)
434 {
435 _data[y][x] = in[y][x];
436 }
437 }
438 }
439
440 ValueAsString scalar(int32_t x, int32_t y) const override
441 {
442 x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
443 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
444
445 ValueAsString t;
446 t.str = _data[y][x];
447 t.type.str = get_cl_data_type(_format.dt, 1);
448 t.type.dt = _format.dt;
449 t.type.size = 1;
450
451 return t;
452 }
453
454 ValueAsString vector(int32_t y) const override
455 {
456 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
457
458 return vector(0, _format.w, y);
459 }
460
461 ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
462 {
463 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
464
465 ValueAsString t;
466 t.str = "";
467 t.type.str = get_cl_data_type(_format.dt, width);
468 t.type.dt = _format.dt;
469 t.type.size = width;
470
471 if(width > 1)
472 {
473 t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
474 }
475
476 int32_t x = x_start;
477        for(; x < x_start + width - 1; ++x)
478 {
479 t.str += scalar(x, y).str;
480 t.str += ", ";
481 }
482 t.str += scalar(x, y).str;
483
484 if(width > 1)
485 {
486 t.str += "))";
487 }
488
489 return t;
490 }
491
492 std::vector<ValueAsString> underlying_source_variables() const override
493 {
494 std::vector<ValueAsString> vars;
495
496 for(int32_t y = 0; y < _format.h; ++y)
497 {
498 for(int32_t x = 0; x < _format.w; ++x)
499 {
500 ValueAsString t;
501 t.str = _data[y][x];
502 t.type.str = get_cl_data_type(_format.dt, 1);
503 t.type.dt = _format.dt;
504 t.type.size = 1;
505 vars.push_back(t);
506 }
507 }
508
509 return vars;
510 }
511
512 bool is_assignable() const override
513 {
514 return false;
515 }
516
517 bool need_declaration() const override
518 {
519 return false;
520 }
521
522private:
523 std::vector<std::vector<std::string>> _data{};
524};
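// A minimal sketch of what ClConstantTile produces (illustrative only):
//   ClConstantTile c({ { "1.0f", "2.0f" }, { "3.0f", "4.0f" } }, DataType::Fp32);
//   c.scalar(1, 0).str -> "2.0f"
//   c.vector(1).str    -> "((float2)(3.0f, 4.0f))"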
525
526enum class TensorComponentIndex : int32_t
527{
528 IndexMask = 0x0000000f,
529};
530
531enum class TensorComponentType : int32_t
532{
533 OffsetFirstElement = 0x00000100,
534 Stride = 0x00001000,
535 Dimension = 0x00010000,
536 FoldedDimension = 0x00100000,
537 Constant = 0x01000000
538};
539
540enum class TensorComponent : int32_t
541{
542 Unknown = 0x00000000,
543 OffsetFirstElement = 0x00000100,
544 Stride1 = 0x00001001,
545 Stride2 = 0x00001002,
546 Stride3 = 0x00001003,
547 Stride4 = 0x00001004,
548 Dim0 = 0x00010000,
549 Dim1 = 0x00010001,
550 Dim2 = 0x00010002,
551 Dim3 = 0x00010003,
552 Dim4 = 0x00010004,
553 C = 0x00010000, // Dim0
554 W = 0x00010001, // Dim1
555 H = 0x00010002, // Dim2
556 D = 0x00010003,
557 N = 0x00010004,
558 Dim1xDim2 = 0x00100021,
559 Dim1xDim2xDim3 = 0x00100321,
560 WxH = 0x00100021,
561 WxHxD = 0x00100321
562};
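// The enumerators above are bit-encoded: the upper bits identify the component type (see
// TensorComponentType) and the lowest nibble (TensorComponentIndex::IndexMask) carries the
// dimension index. For example:
//   Stride2   = 0x00001002 = TensorComponentType::Stride    | index 2
//   Dim3      = 0x00010003 = TensorComponentType::Dimension | index 3
//   Dim1xDim2 = 0x00100021 = TensorComponentType::FoldedDimension with dimensions 2 and 1 packed in the low bits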
563
564inline std::string to_string(TensorComponent x)
565{
566 switch(x)
567 {
568 case TensorComponent::Unknown:
569 return "Unknown";
570 case TensorComponent::OffsetFirstElement:
571 return "OffsetFirstElement";
572 case TensorComponent::Stride1:
573 return "Stride1";
574 case TensorComponent::Stride2:
575 return "Stride2";
576 case TensorComponent::Stride3:
577 return "Stride3";
578 case TensorComponent::Stride4:
579 return "Stride4";
580 case TensorComponent::Dim0:
581 return "Dim0";
582 case TensorComponent::Dim1:
583 return "Dim1";
584 case TensorComponent::Dim2:
585 return "Dim2";
586 case TensorComponent::Dim3:
587 return "Dim3";
588 case TensorComponent::Dim4:
589 return "Dim4";
590 case TensorComponent::Dim1xDim2:
591 return "Dim1xDim2";
592 case TensorComponent::Dim1xDim2xDim3:
593 return "Dim1xDim2xDim3";
594 default:
595 assert(false);
596 }
597}
598
599class ITensorArgument
600{
601public:
602 virtual ~ITensorArgument() = default;
603 /** Method to get the tensor component as a string
604 *
605 * @param[in] x tensor component to query
606 *
607 * @return the tensor component as a string
608 */
609 virtual std::string component(TensorComponent x) = 0;
610 /** Method to get the tensor component type declaration as a string
611 *
612 * @return the tensor component type declaration as a string
613 */
614 virtual std::string component_type_declaration() const = 0;
615 /** Method to get the tensor component data type
616 *
617 * @return the tensor component data type
618 */
619 virtual DataType component_data_type() const = 0;
620 /** Method to get the tensor component declarations
621 *
622 * @return a vector containing the tensor component declarations
623 */
624 virtual std::vector<TensorComponent> component_declarations() const = 0;
625 /** Method to get the name of the tensor argument.
626 *
627 * @return the name of the tensor argument
628 */
629 std::string name() const
630 {
631 return _basename;
632 }
633 /** Method to get the tensor format
634 *
635 * @return the format
636 */
637 TensorInfo format() const
638 {
639 return _format;
640 }
641
642protected:
643 TensorInfo _format { };
644 std::string _basename {};
645};
646
647enum class GpuTensorStorage : int32_t
648{
649 Unknown = 0x0000,
650 BufferUint8Ptr = 0x0012,
651 Image2dReadOnly = 0x0020,
652 Image2dWriteOnly = 0x0021,
653 Image3dReadOnly = 0x0030,
654 Image3dWriteOnly = 0x0031
655};
656
657class IGpuTensorArgument : public ITensorArgument
658{
659public:
660 virtual ~IGpuTensorArgument() = default;
661 /** Method to get the tensor storage, which is the underlying storage used to keep the data memory
662 *
663 * @param[in] x tensor storage to query
664 *
665 * @return the tensor storage as a string
666 */
667 virtual std::string storage(GpuTensorStorage x) = 0;
668 /** Method to get the tensor storage type declaration as a string
669 *
670 * @param[in] x tensor component to query
671 *
672 * @return the tensor storage type declaration as a string
673 */
674 virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;
675 /** Method to get the tensor storage declarations
676 *
677 * @return a vector containing the tensor storage declarations
678 */
679 virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
680};
681
682class ClTensorArgument : public IGpuTensorArgument
683{
684public:
685 ClTensorArgument(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
686 {
687 _basename = name;
688 _format = x;
689 _return_by_value_when_possible = return_by_value_when_possible;
690 }
691
692 // Methods to override
693 std::string component(TensorComponent x) override
694 {
695 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Constant)))
696 {
697 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
698 return std::to_string(idx - 1);
699 }
700
701 if(_return_by_value_when_possible)
702 {
703 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Dimension)))
704 {
705 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
706 return std::to_string(_format.shape[idx]);
707 }
708
709 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::FoldedDimension)))
710 {
711 switch(x)
712 {
713 case TensorComponent::Dim1xDim2:
714 return std::to_string(_format.shape[1] * _format.shape[2]);
715 case TensorComponent::Dim1xDim2xDim3:
716                    return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[3]);
717 default:
718 std::cout << "Unsupported folded dimension" << std::endl;
719 assert(false);
720 }
721 }
722 }
723
724 if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
725 {
726 _components_required.push_back(x);
727 }
728
729 return build_component_name(x);
730 }
731
732 std::string component_type_declaration() const override
733 {
734 return "int";
735 };
736
737 DataType component_data_type() const override
738 {
739 return DataType::Int32;
740 }
741
742 std::string storage(GpuTensorStorage x) override
743 {
744 if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
745 {
746 _storage_required.push_back(x);
747 }
748
749 return build_storage_name(x);
750 }
751
752 std::string storage_type_declaration(GpuTensorStorage x) const override
753 {
754 switch(x)
755 {
756 case GpuTensorStorage::BufferUint8Ptr:
757 return "__global uchar*";
758 case GpuTensorStorage::Image2dReadOnly:
759 return "__read_only image2d_t";
760 case GpuTensorStorage::Image2dWriteOnly:
761 return "__write_only image2d_t";
762 case GpuTensorStorage::Image3dReadOnly:
763 return "__read_only image3d_t ";
764 case GpuTensorStorage::Image3dWriteOnly:
765 return "__write_only image3d_t ";
766 default:
767 std::cout << "Unsupported storage" << std::endl;
768 assert(false);
769 }
770 };
771
772 std::vector<GpuTensorStorage> storage_declarations() const override
773 {
774 return _storage_required;
775 }
776
777 std::vector<TensorComponent> component_declarations() const override
778 {
779 return _components_required;
780 }
781
782private:
783 std::string build_storage_name(GpuTensorStorage x) const
784 {
785 std::string var_name = _basename;
786
787 switch(x)
788 {
789 case GpuTensorStorage::BufferUint8Ptr:
790 return var_name + "_ptr";
791 case GpuTensorStorage::Image2dReadOnly:
792 case GpuTensorStorage::Image2dWriteOnly:
793 return var_name + "_img2d";
794 case GpuTensorStorage::Image3dReadOnly:
795 case GpuTensorStorage::Image3dWriteOnly:
796 return var_name + "_img3d";
797 default:
798 std::cout << "Unsupported storage" << std::endl;
799 assert(false);
800 }
801
802 return var_name;
803 }
804
805 std::string build_component_name(TensorComponent x) const
806 {
807 std::string var_name = _basename;
808
809 switch(x)
810 {
811 case TensorComponent::OffsetFirstElement:
812 return var_name + "_offset_first_element";
813 case TensorComponent::Stride1:
814 return var_name + "_stride1";
815 case TensorComponent::Stride2:
816 return var_name + "_stride2";
817 case TensorComponent::Stride3:
818 return var_name + "_stride3";
819 case TensorComponent::Dim0:
820 return var_name + "_dim0";
821 case TensorComponent::Dim1:
822 return var_name + "_dim1";
823 case TensorComponent::Dim2:
824 return var_name + "_dim2";
825 case TensorComponent::Dim3:
826 return var_name + "_dim3";
827 case TensorComponent::Dim1xDim2:
828 return var_name + "_dim1xdim2";
829 case TensorComponent::Dim1xDim2xDim3:
830 return var_name + "_dim1xdim2xdim3";
831 default:
832 std::cout << "Unsupported component" << std::endl;
833 assert(false);
834 }
835
836 return var_name;
837 }
838
839 bool _return_by_value_when_possible { false };
840 std::vector<GpuTensorStorage> _storage_required {};
841 std::vector<TensorComponent> _components_required {};
842};
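// A minimal sketch of the naming convention implemented by build_storage_name() and
// build_component_name() above, for a ClTensorArgument created with the name "src"
// (the TensorInfo construction is omitted here; it depends on ckw/TensorInfo.h):
//   arg.storage(GpuTensorStorage::BufferUint8Ptr) -> "src_ptr"      (also recorded in storage_declarations())
//   arg.component(TensorComponent::Stride1)       -> "src_stride1"  (also recorded in component_declarations())
//   arg.component(TensorComponent::Dim1xDim2)     -> "src_dim1xdim2", or the folded value
//                                                    shape[1] * shape[2] when return_by_value_when_possible is true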
843
844/**
845 * @brief Data structure that contains the tiles declared by the components.
846 * The registry is a linear data structure that works like a stack. The user can call the @p increment_registry_level() method to
847 * increase the level of the stack (0 when it starts). When the user calls the @p decrement_registry_level() method, the registry decreases the level of the stack
848 * and removes (pops) all the tiles declared at the level above.
849 * When a tile is declared on level 0, it is a global tile. A global tile is visible in all parts of the code.
850 * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept: an @p id used to prevent name collisions
851 * when declaring tiles among different components.
852 *
853 */
854class GpuTileRegistry
855{
856public:
857enum class RegistryTileType
858{
859 Tile,
860 Link
861};
862
863using RegistryIdSpace = int32_t;
864using RegistryLevel = int32_t;
865using RegistryTileName = std::string;
866
867struct RegistryTileTableEntry
868{
869 RegistryLevel registry_level { 0 };
870 std::unique_ptr<IVectorTile> tile_object { nullptr };
871};
872
873struct RegistryTileTypeTableEntry
874{
875 RegistryTileType tile_type { RegistryTileType::Tile };
876 RegistryTileName tile_name {};
877 RegistryIdSpace registry_idspace { 0 };
878 RegistryLevel registry_level { 0 };
879};
880
881using RegistryTileTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
882using RegistryTileTypeTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>;
883 /**
884 * @brief Construct a new Gpu Tile Registry object
885 *
886 */
887 GpuTileRegistry()
888 {
889 _language = GpuTargetLanguage::Unknown;
890 }
891 /**
892 * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language
893 *
894 * @param[in] language Gpu programming language to use
895 */
896 GpuTileRegistry(GpuTargetLanguage language)
897 {
898 _language = language;
899 }
900 /**
901 * @brief Default destructor. Destroy the Gpu Tile Registry object
902 *
903 */
904 ~GpuTileRegistry() = default;
905 /**
906 * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles.
907 * Therefore, the IdSpace should be set before declaring any tiles.
908 *
909 * @param[in] id The IdSpace id
910 */
911 void set_IdSpace(int32_t id)
912 {
913 _IdSpace = id;
914 }
915 /**
916 * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles
917 *
918 * @return The IdSpace id
919 */
920 int32_t IdSpace() const
921 {
922 return _IdSpace;
923 }
924 /**
925 * @brief Gets all the IdSpace declarations defined in the tile registry.
926 *
927 * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
928 */
929 std::vector<int32_t> IdSpace_declarations() const
930 {
931 std::vector<int32_t> x;
932
933 auto it = _frags.begin();
934
935 while (it != _frags.end())
936 {
937 x.push_back(it->first);
938
939 it++;
940 }
941
942 return x;
943 }
944 /**
945 * @brief Declare a tile from a previously created tile
946 */
947 void insert(const std::string& name, const IVectorTile *frag)
948 {
949 assert(_language == GpuTargetLanguage::OpenCL);
950 const int32_t key_IdSpace = _IdSpace;
951 const std::string key_var_name = name;
952 const std::string var_name = frag->name();
953 TileInfo format = frag->format();
954
955 // First check whether a tile with the same name exists
956 IVectorTile *result = (*this)[key_var_name];
957 assert(result == nullptr);
958 if(result == nullptr)
959 {
960 std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
961
962 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
963 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
964
965 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link;
966 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
967 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
968 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
969 }
970 }
971 /**
972 * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace()
973 *
974 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
975 *
976 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
977 * @param[in] format Tile format to use
978 */
979 void insert(const std::string& name, const TileInfo& format)
980 {
981 assert(_language == GpuTargetLanguage::OpenCL);
982 const int32_t key_IdSpace = _IdSpace;
983 const std::string key_var_name = name;
984 const std::string var_name = generate_tile_name(name);
985
986 // First check whether a tile with the same name exists
987 IVectorTile *result = (*this)[key_var_name];
988 assert(result == nullptr);
989 if(result == nullptr)
990 {
991 std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
992 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
993 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
994
995 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
996 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
997 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
998 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
999 }
1000 }
1001 /**
1002 * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
1003 *
1004 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
1005 *
1006 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
1007 * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are deduced from the 2D std::vector
1008 * @param[in] dt The data type of the elements stored as std::string in the 2D std::vector. It is the user's responsibility to ensure
1009 * that the data type matches the content of the strings.
1010 */
1011 void insert(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt)
1012 {
1013 assert(_language == GpuTargetLanguage::OpenCL);
1014 const int32_t key_IdSpace = _IdSpace;
1015 const std::string key_var_name = name;
1016
1017 // First check whether a tile with the same name exists
1018 IVectorTile *result = (*this)[key_var_name];
1019 assert(result == nullptr);
1020 if(result == nullptr)
1021 {
1022 std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
1023 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
1024 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
1025
1026 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1027 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1028 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1029 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1030 }
1031 }
1032 /**
1033 * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
1034 *
1035 * @note This method can be used to declare temporary tiles that need to be accessed only once.
1036 *
1037 * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are deduced from the 2D std::vector
1038 * @param[in] dt The data type of the elements stored as std::string in the 2D std::vector. It is the user's responsibility to ensure
1039 * that the data type matches the content of the strings.
1040 *
1041 * @return IVectorTile* the anonymous constant tile
1042 */
1043 IVectorTile* insert(const std::vector<std::vector<std::string>>& in, DataType dt)
1044 {
1045 assert(_language == GpuTargetLanguage::OpenCL);
1046 const int32_t key_IdSpace = _IdSpace;
1047 const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++);
1048
1049 // First check whether a tile with the same name exists
1050 IVectorTile *result = (*this)[key_var_name];
1051 assert(result == nullptr);
1052 if(result == nullptr)
1053 {
1054 std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
1055 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
1056 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
1057
1058 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1059 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1060 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1061 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1062 }
1063
1064 return (*this)[key_var_name];
1065 }
1066 /**
1067 * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user
1068 *
1069 * @param[in] name The name of the tile to retrieve
1070 * @param[in] IdSpace The IdSpace id where to search the tile
1071 *
1072 * @return IVectorTile* The tile
1073 */
1074 IVectorTile* get(const std::string& name, int32_t IdSpace)
1075 {
1076 const int32_t key_IdSpace = IdSpace;
1077 const std::string key_var_name = name;
1078
1079 IVectorTile* result = nullptr;
1080 auto search_IdSpace = _frags.find(key_IdSpace);
1081 if(search_IdSpace != _frags.end())
1082 {
1083 auto search_tile = _frags[key_IdSpace].find(key_var_name);
1084 if(search_tile != _frags[key_IdSpace].end())
1085 {
1086 result = search_tile->second.tile_object.get();
1087 assert(result != nullptr);
1088 }
1089 }
1090
1091 return result;
1092 }
1093 /**
1094 * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace()
1095 *
1096 * @param[in] name The name of the tile to retrieve
1097 *
1098 * @return IVectorTile* The tile
1099 */
1100 IVectorTile* operator[](const std::string& name)
1101 {
1102 return get(name, _IdSpace);
1103 }
1104 /**
1105 * @brief Check whether a tile with the given name exists in the IdSpace provided by the user
1106 *
1107 * @param[in] name Name of the tile to search for
1108 * @param[in] IdSpace The IdSpace id where to search the tile
1109 *
1110 * @return true if the tile exists
1111 * @return false if the tile does not exist
1112 */
1113 bool has_tile(const std::string& name, int32_t IdSpace) const
1114 {
1115 const int32_t key_IdSpace = IdSpace;
1116 const std::string key_var_name = name;
1117
1118        // Check both that the IdSpace exists and that a tile with this name was declared in it
1119        auto search_IdSpace = _frags.find(key_IdSpace);
1120
1121        return search_IdSpace != _frags.end() && search_IdSpace->second.find(key_var_name) != search_IdSpace->second.end();
1122 }
1123 /**
1124 * @brief Check whether the tile within the current IdSpace exists
1125 *
1126 * @param[in] name Name of the tile to search for
1127 *
1128 * @return true if the tile exists
1129 * @return false if the tile does not exist
1130 */
1131 bool has_tile(const std::string& name) const
1132 {
1133 return has_tile(name, _IdSpace);
1134 }
1135 /**
1136 * @brief Get all the tiles declared within the IdSpace provided by the user
1137 *
1138 * @param[in] IdSpace IdSpace where to retrieve all the declared tiles
1139 *
1140 * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user
1141 */
1142 std::vector<IVectorTile*> tile_declarations(int32_t IdSpace)
1143 {
1144 std::vector<IVectorTile*> tiles;
1145
1146 std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();
1147
1148 while (it != _frag_types[IdSpace].end())
1149 {
1150            // The following condition should be enabled. However, we cannot enable it at this stage
1151            // because this method is currently used to retrieve the output tile produced by each component,
1152            // even though this method should NOT be used to retrieve the output tile
1153 //if(it->second.tile_type == RegistryTileType::Tile)
1154 {
1155 tiles.push_back(get(it->second.tile_name, it->second.registry_idspace));
1156 }
1157 it++;
1158 }
1159
1160 return tiles;
1161 }
1162 /**
1163 * @brief Increase the level of the stack.
1164 *
1165 */
1166 void increment_registry_level()
1167 {
1168 _registry_level++;
1169 }
1170 /**
1171 * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack.
1172 *
1173 */
1174 void decrement_registry_level()
1175 {
1176 assert(_registry_level >= 0);
1177
1178 // Remove all variables in the local scope
1179 std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();
1180
1181 while (it != _frags[_IdSpace].end())
1182 {
1183 if (it->second.registry_level == _registry_level)
1184 {
1185 it = _frags[_IdSpace].erase(it);
1186 }
1187 else
1188 {
1189 it++;
1190 }
1191 }
1192
1193 std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();
1194
1195 while (it_type != _frag_types[_IdSpace].end())
1196 {
1197 if (it_type->second.registry_level == _registry_level)
1198 {
1199 it_type = _frag_types[_IdSpace].erase(it_type);
1200 }
1201 else
1202 {
1203 it_type++;
1204 }
1205 }
1206
1207 _registry_level--;
1208 }
1209 /**
1210 * @brief Get the level of the stack
1211 *
1212 */
1213 int32_t level() const
1214 {
1215 return _registry_level;
1216 }
1217
1218private:
1219 // This method ensures that the key is unique among different components
1220 std::string generate_tile_name(const std::string& name)
1221 {
1222 assert(_IdSpace >= 0 );
1223 if(_registry_level == 0)
1224 {
1225 return "_G" + std::to_string(_IdSpace) + "_" + name;
1226 }
1227 else
1228 {
1229 return name;
1230 }
1231 }
1232 RegistryTileTable _frags {};
1233 RegistryTileTypeTable _frag_types {};
1234 RegistryLevel _registry_level { 0 };
1235 RegistryIdSpace _IdSpace { -1 };
1236 int32_t _anonymous_frag_count { 0 }; // Counter used to create the anonymous tiles
1237 GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language
1238};
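// A minimal usage sketch for the registry above (illustrative only; tile names and shapes are arbitrary):
//   GpuTileRegistry registry(GpuTargetLanguage::OpenCL);
//   registry.set_IdSpace(0);                                  // select the IdSpace before declaring tiles
//   registry.insert("dst", TileInfo(DataType::Fp32, 4, 4));   // level-0 (global) tile
//   registry.increment_registry_level();                      // enter a nested scope
//   registry.insert("tmp", TileInfo(DataType::Fp32, 4, 1));   // level-1 (local) tile
//   registry.decrement_registry_level();                      // pops "tmp"; "dst" remains visible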
1239
1240using TensorEntry = std::unique_ptr<IGpuTensorArgument>;
1241
1242/**
1243 * @brief Data structure that contains the tensors consumed by the components.
1244 * Since different components may use the same reference name for a tensor, the registry adopts the IdSpace concept: an @p id used to prevent name collisions
1245 * when declaring tensors among different components.
1246 *
1247 */
1248class GpuTensorArgumentRegistry
1249{
1250public:
1251 /**
1252 * @brief Construct a new Gpu Tensor Registry object
1253 *
1254 */
1255 GpuTensorArgumentRegistry()
1256 {
1257 _language = GpuTargetLanguage::Unknown;
1258 }
1259 /**
1260 * @brief Construct a new Gpu Tensor Registry object
1261 *
1262 * @param[in] language Gpu programming language to use
1263 */
1264 GpuTensorArgumentRegistry(GpuTargetLanguage language)
1265 {
1266 _language = language;
1267 }
1268 /**
1269 * @brief Default destructor. Destroy the Gpu Tensor Registry object
1270 *
1271 */
1272 ~GpuTensorArgumentRegistry() = default;
1273 /**
1274 * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors.
1275 * Therefore, the IdSpace should be set before declaring any tensors.
1276 *
1277 * @param[in] id The IdSpace id
1278 */
1279 void set_IdSpace(int32_t id)
1280 {
1281 _IdSpace = id;
1282 }
1283 /**
1284 * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors
1285 *
1286 * @return The IdSpace id
1287 */
1288 int32_t IdSpace() const
1289 {
1290 return _IdSpace;
1291 }
1292 /**
1293 * @brief Gets all the IdSpace declarations defined in the tensor registry.
1294 *
1295 * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
1296 */
1297 std::vector<int32_t> IdSpace_declarations() const
1298 {
1299 std::vector<int32_t> x;
1300
1301 auto it = _refs.begin();
1302
1303 while (it != _refs.end())
1304 {
1305 x.push_back(it->first);
1306
1307 it++;
1308 }
1309
1310 return x;
1311 }
1312 /**
1313 * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace()
1314 *
1315 * @note The reference name used for declaring the tensor should not be previously used in the IdSpace
1316 *
1317 * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry.
1318 * @param[in] x Tensor info; the tensor id is taken from its id field
1319 * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components
1320 */
1321 void insert(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
1322 {
1323 assert(_language == GpuTargetLanguage::OpenCL);
1324 const int32_t key_IdSpace = _IdSpace;
1325 const int32_t tensor_id = x.id;
1326 const std::string key_var_name = name;
1327 const std::string var_name = generate_tensor_name(name, tensor_id);
1328
1329 // First, check whether the tensor has already a reference. If so, trigger an assert
1330 assert(!has_tensor_argument(name));
1331
1332 // Check whether a tensor with that tensorID exists
1333 auto result = _tensor_arguments.find(tensor_id);
1334 if(result == _tensor_arguments.end())
1335 {
1336 // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference
1337 std::unique_ptr<ClTensorArgument> arg = std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
1338 _tensor_arguments[tensor_id] = std::move(arg);
1339 }
1340
1341 _refs[key_IdSpace][key_var_name] = tensor_id;
1342 }
1343 /**
1344 * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace()
1345 *
1346 * @param[in] name The name of the tensor to retrieve
1347 *
1348 * @return IGpuTensorArgument* The tensor
1349 */
1350 IGpuTensorArgument* operator[](const std::string& name)
1351 {
1352 const int32_t key_IdSpace = _IdSpace;
1353 const std::string key_var_name = name;
1354
1355 IGpuTensorArgument* result = nullptr;
1356 auto search_IdSpace = _refs.find(key_IdSpace);
1357 if(search_IdSpace != _refs.end())
1358 {
1359 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1360
1361 if(search_tensor_id != _refs[key_IdSpace].end())
1362 {
1363 const int32_t tensor_id = search_tensor_id->second;
1364 auto search_tensor_argument = _tensor_arguments.find(tensor_id);
1365 if(search_tensor_argument != _tensor_arguments.end())
1366 {
1367 result = search_tensor_argument->second.get();
1368 }
1369 assert(result != nullptr);
1370 }
1371 }
1372
1373 return result;
1374 }
1375 /**
1376 * @brief Get all the tensors declared in the IdSpace provided by the user
1377 *
1378 * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors
1379 */
1380 std::vector<IGpuTensorArgument*> tensor_argument_declarations()
1381 {
1382 std::vector<IGpuTensorArgument*> args;
1383
1384 auto it = _tensor_arguments.begin();
1385
1386 while (it != _tensor_arguments.end())
1387 {
1388 args.push_back(it->second.get());
1389 it++;
1390 }
1391
1392 return args;
1393 }
1394 /**
1395 * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists
1396 *
1397 * @param[in] name Name of the tensor argument to search for
1398 *
1399 * @return true if the tensor argument exists
1400 * @return false if the tensor argument does not exist
1401 */
1402 bool has_tensor_argument(const std::string& name)
1403 {
1404 const int32_t key_IdSpace = _IdSpace;
1405 const std::string key_var_name = name;
1406
1407 auto search_IdSpace = _refs.find(key_IdSpace);
1408
1409 if(search_IdSpace != _refs.end())
1410 {
1411 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1412
1413 return search_tensor_id != _refs[key_IdSpace].end();
1414 }
1415 else
1416 {
1417 return false;
1418 }
1419 }
1420 /**
1421 * @brief Check whether the tensor argument exists in the IdSpace provided by the user
1422 *
1423 * @param[in] name Name of the tensor argument to search for
1424 * @param[in] IdSpace The IdSpace id where to search the tensor argument
1425 *
1426 * @return true if the tensor argument exists
1427 * @return false if the tensor argument does not exist
1428 */
1429 bool has_tensor_argument(const std::string& name, int32_t IdSpace)
1430 {
1431 const int32_t key_IdSpace = IdSpace;
1432 const std::string key_var_name = name;
1433
1434 auto search_IdSpace = _refs.find(key_IdSpace);
1435
1436 if(search_IdSpace != _refs.end())
1437 {
1438 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1439
1440 return search_tensor_id != _refs[key_IdSpace].end();
1441 }
1442 else
1443 {
1444 return false;
1445 }
1446 }
1447private:
1448 // This method ensures that the key is unique among different components
1449 std::string generate_tensor_name(const std::string& name, int32_t tensor_id)
1450 {
1451 assert(tensor_id >= 0 );
1452
1453 return name + std::to_string(tensor_id);
1454 }
1455
1456 std::map<int32_t, TensorEntry> _tensor_arguments {};
1457 std::map<int32_t, std::map<std::string, int32_t>> _refs {};
1458 int32_t _IdSpace { -1 };
1459 GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language
1460};
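// A minimal usage sketch for the registry above (illustrative only; src_info is an assumed
// TensorInfo whose id field is set to 0):
//   GpuTensorArgumentRegistry tensors(GpuTargetLanguage::OpenCL);
//   tensors.set_IdSpace(0);
//   tensors.insert("src", src_info, /* return_by_value_when_possible */ false);
//   IGpuTensorArgument *src = tensors["src"];                            // resolved through the tensor id
//   std::string ptr = src->storage(GpuTensorStorage::BufferUint8Ptr);    // "src0_ptr" (name + tensor id + "_ptr")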
1461
1462enum class OpType : int32_t
1463{
1464 Elementwise = 0x0000,
1465 Relational = 0x1000,
1466 Algebra = 0x2000
1467};
1468
1469inline std::string to_string(AssignmentOp op)
1470{
1471 switch(op)
1472 {
1473 case AssignmentOp::Decrement:
1474 return "-=";
1475 case AssignmentOp::Increment:
1476 return "+=";
1477
1478 default:
1479 assert(false);
1480 }
1481}
1482
1483inline std::string to_string(BinaryOp op)
1484{
1485 switch(op)
1486 {
1487 case BinaryOp::Add:
1488 return "+";
1489 case BinaryOp::Sub:
1490 return "-";
1491 case BinaryOp::Mul:
1492 return "*";
1493 case BinaryOp::Div:
1494 return "/";
1495 case BinaryOp::Mod:
1496 return "%";
1497 case BinaryOp::Equal:
1498 return "==";
1499 case BinaryOp::Less:
1500 return "<";
1501 case BinaryOp::LessEqual:
1502 return "<=";
1503 case BinaryOp::Greater:
1504 return ">";
1505 case BinaryOp::GreaterEqual:
1506 return ">=";
1507 case BinaryOp::LogicalAnd:
1508 return "&&";
1509 case BinaryOp::LogicalOr:
1510 return "||";
1511 case BinaryOp::LogicalNot:
1512 return "!";
1513 default:
1514 assert(false);
1515 }
1516}
1517
1518inline std::string binary_op_string(BinaryOp op)
1519{
1520 switch(op)
1521 {
1522 case BinaryOp::Add:
1523 return "add";
1524 case BinaryOp::Sub:
1525 return "sub";
1526 case BinaryOp::Mul:
1527 return "mul";
1528 case BinaryOp::Div:
1529 return "div";
1530 case BinaryOp::Mod:
1531 return "mod";
1532 case BinaryOp::Equal:
1533 return "eq";
1534 case BinaryOp::Less:
1535            return "lt";
1536        case BinaryOp::LessEqual:
1537            return "lte";
1538        case BinaryOp::Greater:
1539            return "gt";
1540        case BinaryOp::GreaterEqual:
1541            return "gte";
1542 default:
1543 assert(false);
1544 }
1545}
1546
1547enum class OperandType : int32_t
1548{
1549 Unknown = 0x00000000,
1550 ScalarFp32 = 0x00001011, // Immediate scalar tile
1551 ScalarFp16 = 0x00001012, // Immediate scalar tile
1552 ScalarInt32 = 0x00001021, // Immediate scalar tile
1553 ScalarInt16 = 0x00001022, // Immediate scalar tile
1554 ScalarInt8 = 0x00001024, // Immediate scalar tile
1555 ScalarUInt32 = 0x00001031, // Immediate scalar tile
1556 ScalarUInt16 = 0x00001032, // Immediate scalar tile
1557 ScalarUInt8 = 0x00001034, // Immediate scalar tile
1558 ScalarBool = 0x00001041, // Immediate scalar tile
1559 ScalarTile = 0x00001050, // Scalar from a tile
1560 Tile = 0x00010000, // Tile
1561 TensorStride1 = 0x00100001, // Tensor component
1562 TensorStride2 = 0x00100002, // Tensor component
1563 TensorStride3 = 0x00100003, // Tensor component
1564 TensorStride4 = 0x00100004, // Tensor component
1565 TensorDim0 = 0x00100010, // Tensor component
1566 TensorDim1 = 0x00100020, // Tensor component
1567 TensorDim2 = 0x00100030, // Tensor component
1568 TensorDim3 = 0x00100040, // Tensor component
1569 TensorDim4 = 0x00100050, // Tensor component
1570 TensorC = 0x00100010, // Tensor component
1571 TensorW = 0x00100020, // Tensor component
1572 TensorH = 0x00100030, // Tensor component
1573 TensorD = 0x00100040, // Tensor component
1574 TensorN = 0x00100050, // Tensor component
1575 TensorDim1xDim2 = 0x00100100, // Tensor component
1576 TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
1577 TensorWxH = 0x00100300, // Tensor component
1578 TensorWxHxD = 0x00100400, // Tensor component
1579 TensorDataOffset = 0x00100500, // Tensor component
1580};
1581
1582struct ScalarTileCoord
1583{
1584 ScalarTileCoord() {}
1585 ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) {}
1586 int32_t x { -1 };
1587 int32_t y { -1 };
1588};
1589/**
1590 * @brief Operand class. This object is used to pass the operands to the operations performed by the writer.
1591 * Operand can be of three types:
1592 * -# Scalar immediate: constant expression
1593 * -# Tile: A tile
1594 * -# Tensor component: A component (scalar) of a tensor
1595 *
1596 */
1597class Operand
1598{
1599public:
1600 Operand(const std::string &val)
1601 {
1602 _str = val;
1603 _type = OperandType::Tile;
1604 }
1605
1606 Operand(const std::string &val, const ScalarTileCoord& coord)
1607 {
1608 _str = val;
1609 _type = OperandType::ScalarTile;
1610 _coord = coord;
1611 }
1612
1613 Operand(const std::string &val, OperandType type)
1614 {
1615 _str = val;
1616 _type = type;
1617 }
1618
1619 Operand(const Operand& t)
1620 {
1621 _str = t.value();
1622        _type  = t.type();
        _coord = t.scalar_tile_coordinate();
1623    }
1624
1625 Operand& operator=(const Operand& t)
1626 {
1627 _str = t.value();
1628 _type = t.type();
1629 _coord = t.scalar_tile_coordinate();
1630 return *this;
1631 }
1632
1633 std::string value() const
1634 {
1635 return _str;
1636 }
1637
1638 OperandType type() const
1639 {
1640 return _type;
1641 }
1642
1643 ScalarTileCoord scalar_tile_coordinate() const
1644 {
1645 return _coord;
1646 }
1647
1648private:
1649 std::string _str {};
1650 OperandType _type { OperandType::Unknown };
1651 ScalarTileCoord _coord {};
1652};
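// A minimal sketch of how the different kinds of operand are constructed (names are illustrative):
//   Operand imm("1.0f", OperandType::ScalarFp32);   // scalar immediate (constant expression)
//   Operand tile("acc");                            // tile previously declared in the tile registry
//   Operand elem("lhs", ScalarTileCoord(0, 1));     // single scalar (x = 0, y = 1) of tile "lhs"
//   Operand dim("src", OperandType::TensorDim0);    // tensor component of tensor "src"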
1653
1654enum class GpuSamplerTensorStorage : int32_t
1655{
1656 Unknown = static_cast<int32_t>(GpuTensorStorage::Unknown),
1657 BufferUint8Ptr = static_cast<int32_t>(GpuTensorStorage::BufferUint8Ptr),
1658 Image2dReadOnly = static_cast<int32_t>(GpuTensorStorage::Image2dReadOnly),
1659 Image2dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image2dWriteOnly),
1660 Image3dReadOnly = static_cast<int32_t>(GpuTensorStorage::Image3dReadOnly),
1661    Image3dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image3dWriteOnly),
1662};
1663
1664struct GpuSampler
1665{
1666 GpuSampler() = default;
1667 TensorSamplerFormat format { TensorSamplerFormat::Unknown };
1668 GpuSamplerTensorStorage storage { GpuSamplerTensorStorage::Unknown };
1669 TensorSamplerAddressModeX address_mode_x { TensorSamplerAddressModeX::Unknown };
1670 TensorSamplerAddressModeY address_mode_y { TensorSamplerAddressModeY::Unknown };
1671 TensorSamplerAddressModeZ address_mode_z { TensorSamplerAddressModeZ::Unknown };
1672};
1673
1674inline GpuSampler create_simple_sampler(const TensorInfo* tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
1675{
1676 auto tensor = tensor_info_id->shape;
1677
1678 GpuSampler dst_sampler;
1679 dst_sampler.format = sampler.format;
1680 dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr;
1681 dst_sampler.address_mode_x = sampler.address_mode_x;
1682 dst_sampler.address_mode_y = sampler.address_mode_y;
1683 dst_sampler.address_mode_z = sampler.address_mode_z;
1684
1685 int32_t dim_x = 0;
1686 int32_t dim_y = 0;
1687 int32_t dim_z = 0;
1688
1689 switch(sampler.format)
1690 {
1691 case TensorSamplerFormat::C_W_H:
1692 dim_x = tensor[0];
1693 dim_y = tensor[1];
1694 dim_z = tensor[2];
1695 break;
1696 case TensorSamplerFormat::C_WH_1:
1697 dim_x = tensor[0];
1698 dim_y = tensor[1] * tensor[2];
1699 dim_z = 1;
1700 break;
1701 default:
1702 std::cout << "Unsupported tensor format" << std::endl;
1703 assert(false);
1704 break;
1705 }
1706
1707 if(dim_x == 1)
1708 {
1709 assert(step_x == 1);
1710 dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
1711 }
1712
1713 if(dim_y == 1)
1714 {
1715 assert(step_y == 1);
1716 dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
1717 }
1718
1719 if(dim_z == 1)
1720 {
1721 assert(step_z == 1);
1722 dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
1723 }
1724
1725 return dst_sampler;
1726}
1727
1728class GpuOutputSampler
1729{
1730public:
1731 GpuOutputSampler() = default;
1732 /**
1733     * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once
1734     * by the root component. Once initialized, all simple components will need to use this sampler
1735     * or a broadcasted version of it
1736     *
1737     * @param[in] tensor_info_id Tensor info of the output tensor
     * @param[in] tensor_storage Tensor storage to use for the output tensor
     * @param[in] tensor_format Tensor sampler format to use for the output tensor
1738     * @param[in] step_x Increment step in the X direction. It is not necessarily the same as the n0 of the tile!
1739     * @param[in] step_y Increment step in the Y direction. It is not necessarily the same as the m0 of the tile!
1740     * @param[in] step_z Increment step in the Z direction. It is not necessarily the same as the d0 of the tile!
1741 */
1742 void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z)
1743 {
1744 assert(_is_initialized == false);
1745
1746 _step_x = step_x;
1747 _step_y = step_y;
1748 _step_z = step_z;
1749 _tensor_info_id = tensor_info_id;
1750 _sampler = create_sampler(tensor_storage, tensor_format);
1751 _is_initialized = true;
1752 };
1753
1754 GpuSampler sampler() const
1755 {
1756 return _sampler;
1757 };
1758
1759 int32_t step_x() const
1760 {
1761 return _step_x;
1762 };
1763
1764 int32_t step_y() const
1765 {
1766 return _step_y;
1767 };
1768
1769 int32_t step_z() const
1770 {
1771 return _step_z;
1772 };
1773private:
1774 GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format)
1775 {
1776 // Output can only be in output mode
1777 assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly);
1778 assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly);
1779
1780 auto tensor = _tensor_info_id->shape;
1781
1782 GpuSampler sampler;
1783 sampler.format = tensor_format;
1784 sampler.storage = tensor_storage;
1785 sampler.address_mode_x = TensorSamplerAddressModeX::None;
1786 sampler.address_mode_y = TensorSamplerAddressModeY::None;
1787 sampler.address_mode_z = TensorSamplerAddressModeZ::None;
1788
1789 // In the case of texture, we do not need any special checks at the border
1790 if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
1791 {
1792 int32_t dim_x = 0;
1793 int32_t dim_y = 0;
1794 int32_t dim_z = 0;
1795
1796 switch(tensor_format)
1797 {
1798 case TensorSamplerFormat::C_W_H:
1799 dim_x = tensor[0];
1800 dim_y = tensor[1];
1801 dim_z = tensor[2];
1802 break;
1803 case TensorSamplerFormat::C_WH_1:
1804 dim_x = tensor[0];
1805 dim_y = tensor[1] * tensor[2];
1806 dim_z = 1;
1807 break;
1808 default:
1809 std::cout << "Unsupported tensor format" << std::endl;
1810 assert(false);
1811 break;
1812 }
1813
1814 if((dim_x % _step_x) != 0 && dim_x != 1)
1815 {
1816 sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
1817 }
1818
1819 if((dim_y % _step_y) != 0 && dim_y != 1)
1820 {
1821 sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
1822 }
1823
1824 if((dim_z % _step_z) != 0 && dim_z != 1)
1825 {
1826 sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
1827 }
1828 }
1829
1830 return sampler;
1831 }
1832 GpuSampler _sampler { }; // GpuSampler
1833 int32_t _step_x { 1 };
1834 int32_t _step_y { 1 };
1835 int32_t _step_z { 1 };
1836 const TensorInfo* _tensor_info_id { nullptr };
1837 bool _is_initialized { false };
1838};
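// A minimal usage sketch (illustrative only; dst_info is an assumed TensorInfo describing the output tensor):
//   GpuOutputSampler out_sampler;
//   out_sampler.initialize(&dst_info, GpuSamplerTensorStorage::BufferUint8Ptr,
//                          TensorSamplerFormat::C_W_H, /* step_x */ 4, /* step_y */ 1, /* step_z */ 1);
//   GpuSampler s = out_sampler.sampler();  // address modes are set based on the tensor shape and the steps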
1839
1840/**
1841 * @brief Tensor operand class. This object is used to pass tensor operands to the operations performed by the writer.
1842 */
1843class TensorOperand
1844{
1845public:
1846 TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
1847 {
1848 }
1849
1850 TensorOperand& operator=(const TensorOperand& t)
1851 {
1852 _str = t.value();
1853 _sampler = t.sampler();
1854 return *this;
1855 }
1856
1857 std::string value() const
1858 {
1859 return _str;
1860 }
1861
1862 GpuSampler sampler() const
1863 {
1864 return _sampler;
1865 }
1866
1867private:
1868 std::string _str {};
1869 GpuSampler _sampler {};
1870};
1871
1872/**
1873 * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer
1874 * This data structure must be initialized before being passed to the Gpu Kernel Writer
1875 *
1876 */
1877class GpuKernelWriterDataHolder
1878{
1879public:
1880 /**
1881 * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
1882 * the GPU target and target specific capabilities (extensions). For now, we just initialize the
1883 * programming language
1884 *
1885 * @param[in] language Gpu programming language to use
1886 */
1887 GpuKernelWriterDataHolder(GpuTargetLanguage language) : tiles(language), arguments(language), code(""), _language(language)
1888 {
1889 }
1890 /**
1891 * @brief Get the Gpu programming language used
1892 *
1893 * @return GpuTargetLanguage the Gpu programming language
1894 */
1895 GpuTargetLanguage programming_language() const
1896 {
1897 return _language;
1898 }
1899 /**
1900 * @brief @ref GpuTileRegistry
1901 *
1902 */
1903 GpuTileRegistry tiles{};
1904 /**
1905 * @brief @ref GpuTensorArgumentRegistry
1906 *
1907 */
1908 GpuTensorArgumentRegistry arguments{};
1909 /**
1910 * @brief @ref GpuOutputSampler.
1911 *
1912 */
1913 GpuOutputSampler output_sampler{};
1914 /**
1915 * @brief Source code
1916 *
1917 */
1918 std::string code{};
1919
1920 // GpuExtensionRegistry extensions{};
1921private:
1922 GpuTargetLanguage _language;
1923};
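// A minimal usage sketch (illustrative only):
//   GpuKernelWriterDataHolder data(GpuTargetLanguage::OpenCL);
//   data.tiles.set_IdSpace(0);
//   data.arguments.set_IdSpace(0);
//   // ... components declare tiles/tensor arguments and the writer appends the generated source to data.code ...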
1924
1925struct LWS
1926{
1927 int32_t x {1};
1928 int32_t y {1};
1929 int32_t z {1};
1930};
1931
1932/**
1933 * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker
1934 * declares an anonymous tile in the tile registry.
1935 */
1936class OperandUnpacker
1937{
1938public:
1939 OperandUnpacker(GpuTileRegistry& tiles, GpuTensorArgumentRegistry& arguments) : _tiles(tiles), _arguments(arguments)
1940 {
1941 // Increase the level of the stack to allocate possible temporary tiles
1942 _tiles.increment_registry_level();
1943 };
1944
1945 ~OperandUnpacker()
1946 {
1947 // Decrease the level of the stack to deallocate any temporary tiles
1948 _tiles.decrement_registry_level();
1949 }
1950
1951 IVectorTile* unpack(const Operand& src)
1952 {
1953 // Get the tile
1954 if(src.type() == OperandType::Tile)
1955 {
1956 assert(_tiles.has_tile(src.value()));
1957 return _tiles[src.value()];
1958 }
1959 // Create an anonymous tile with a constant
1960 else if(static_cast<int32_t>(src.type()) & 0x00001000)
1961 {
1962 if(src.type() == OperandType::ScalarTile)
1963 {
1964 ScalarTileCoord coord = src.scalar_tile_coordinate();
1965 assert(_tiles.has_tile(src.value()));
1966 assert(coord.x >= 0);
1967 assert(coord.y >= 0);
1968 auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
1969 return _tiles.insert({{{val.str}}}, val.type.dt);
1970 }
1971 else
1972 {
1973 return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
1974 }
1975 }
1976 // Create an anonymous tile with the tensor component
1977 else
1978 {
1979 assert(_arguments.has_tensor_argument(src.value()));
1980 auto x = _arguments[src.value()];
1981 const std::string val = x->component(to_tensor_component(src.type()));
1982 const DataType dt = x->component_data_type();
1983 return _tiles.insert({{{val}}}, dt);
1984 }
1985 }
1986
1987private:
1988 DataType to_tile_data_type(OperandType x)
1989 {
1990 return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
1991 }
1992
1993 TensorComponent to_tensor_component(OperandType x)
1994 {
1995 switch(x)
1996 {
1997 case OperandType::TensorDim0:
1998 return TensorComponent::Dim0;
1999 case OperandType::TensorDim1:
2000 return TensorComponent::Dim1;
2001 case OperandType::TensorDim2:
2002 return TensorComponent::Dim2;
2003 case OperandType::TensorDim3:
2004 return TensorComponent::Dim3;
2005 case OperandType::TensorDim4:
2006 return TensorComponent::Dim4;
2007 case OperandType::TensorStride1:
2008 return TensorComponent::Stride1;
2009 case OperandType::TensorStride2:
2010 return TensorComponent::Stride2;
2011 case OperandType::TensorStride3:
2012 return TensorComponent::Stride3;
2013 case OperandType::TensorStride4:
2014 return TensorComponent::Stride4;
2015 case OperandType::TensorDim1xDim2:
2016 return TensorComponent::Dim1xDim2;
2017 case OperandType::TensorDim1xDim2xDim3:
2018 return TensorComponent::Dim1xDim2xDim3;
2019 case OperandType::TensorDataOffset:
2020 return TensorComponent::OffsetFirstElement;
2021 default:
2022 assert(false);
2023 }
2024 }
2025
2026 GpuTileRegistry& _tiles;
2027 GpuTensorArgumentRegistry& _arguments;
2028};
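// Example (illustrative sketch; the tile name "acc" and tensor name "src" are hypothetical and
// must already exist in the respective registries):
//
//   OperandUnpacker operands(data.tiles, data.arguments);
//   IVectorTile *acc = operands.unpack(Operand("acc"));                          // existing tile
//   IVectorTile *c4  = operands.unpack(Operand("4", OperandType::ScalarInt32));  // anonymous constant tile
//   IVectorTile *dim = operands.unpack(Operand("src", OperandType::TensorDim1)); // anonymous tile from a tensor component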
2029
2030/**
2031 * @brief Utility class used to get the tensor argument from a tensor operand. The operand must refer to a tensor
2032 * that has already been registered in the tensor argument registry.
2033 * Tensor dimension reduction aims to reduce the tensor data dimensions while keeping the data's tensor structure.
2034 */
2035class TensorOperandUnpacker
2036{
2037public:
2038 TensorOperandUnpacker(GpuTensorArgumentRegistry& arguments) : _arguments(arguments)
2039 {
2040 };
2041
2042 IGpuTensorArgument* unpack(const TensorOperand& src)
2043 {
2044 assert(_arguments.has_tensor_argument(src.value()));
2045 return _arguments[src.value()];
2046 }
2047
2048private:
2049 GpuTensorArgumentRegistry& _arguments;
2050};
2051
2052/**
2053 * @brief The GpuKernel will be used on three occasions (stages):
2054 * #- Compilation stage
2055 * #- Tuning stage
2056 * #- Dispatch stage
2057 */
2058struct GpuKernel
2059{
2060 // Compilation stage
2061 std::string code {}; // Source code, required for the compilation stage
2062 std::vector<GpuExtensions> list_extensions{}; // Extensions, required for the compilation stage
2063 // Tuning stage
2064 std::string config_id {}; // Unique id, required for the tuning stage
2065 std::vector<LWS> list_lws{}; // LWS to test, required for the tuning stage
2066 // Dispatch stage
2067 GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
2068 std::vector<std::pair<int32_t, GpuTensorStorage>> list_tensor_storages; // List of tensor storages, required for the dispatch stage
2069 std::vector<std::pair<int32_t, TensorComponent>> list_tensor_components; // List of tensor components (width, stride, ...), required for the dispatch stage
2070};
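// Example (illustrative sketch): a minimal way to populate the three stages. The kernel name,
// the config id and the data holder "data" are hypothetical.
//
//   GpuKernel kernel;
//   kernel.code           = generate_code(data, "my_kernel"); // compilation stage
//   kernel.config_id      = "my_kernel_fp32";                 // tuning stage
//   kernel.list_lws.push_back({ 4, 4, 1 });                   // tuning stage
//   kernel.output_sampler = data.output_sampler;              // dispatch stage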
2071
2072// This function produces the complete kernel source code from the information in the data holder
2073inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string& name)
2074{
2075 std::string code;
2076 code += "__kernel void ";
2077 code += name;
2078 code += "(\n";
2079
2080 auto IdSpaces = in.arguments.IdSpace_declarations();
2081
2082 std::vector<std::string> arg_str;
2083
2084 auto tensor_args = in.arguments.tensor_argument_declarations();
2085
2086 for(auto &i : tensor_args)
2087 {
2088 // For each tensor used, get the storage and tensor components
2089 auto storages = i->storage_declarations();
2090 auto components = i->component_declarations();
2091
2092 for(auto &y : storages)
2093 {
2094 std::string str;
2095 str += i->storage_type_declaration(y);
2096 str += " ";
2097 str += i->storage(y);
2098 arg_str.push_back(str);
2099 }
2100
2101 for(auto &y : components)
2102 {
2103 std::string str;
2104 str += i->component_type_declaration();
2105 str += " ";
2106 str += i->component(y);
2107 arg_str.push_back(str);
2108 }
2109 }
2110
2111 for(size_t i = 0; i < arg_str.size(); ++i)
2112 {
2113 code += arg_str[i];
2114 if(i + 1 < arg_str.size())
2115 {
2116 code += ",\n";
2117 }
2118 }
2119
2120 code += ")\n";
2121 code += "{\n";
2122 code += in.code;
2123 code += "}\n";
2124
2125 return code;
2126};
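// Example (illustrative sketch): for a kernel named "binary_add" whose argument registry holds one
// buffer storage and one stride component, the emitted skeleton has the shape below. The actual
// argument names and type strings are produced by the tensor argument implementation, so the ones
// shown here are only indicative.
//
//   __kernel void binary_add(
//   __global uchar *src_ptr,
//   uint src_stride_y)
//   {
//   /* body accumulated in GpuKernelWriterDataHolder::code */
//   }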
2127
2128/**
2129 * @brief This class is responsible for mapping an N-dimensional tensor to a 3D tensor. The mapper needs the GpuSampler to know
2130 * how to reduce the dimensionality of the tensor.
2131 *
2132 */
2133class GpuTensor3dMapper
2134{
2135public:
2136 GpuTensor3dMapper(IGpuTensorArgument* tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor)
2137 {
2138 };
2139
2140 std::string tensor_component_x() const
2141 {
2142 const auto format = _sampler.format;
2143 switch(format)
2144 {
2145 case TensorSamplerFormat::C_WH_1:
2146 case TensorSamplerFormat::C_W_H:
2147 return _tensor->component(TensorComponent::C);
2148 default:
2149 std::cout << "Unsupported tensor format" << std::endl;
2150 assert(false);
2151 }
2152 }
2153
2154 std::string tensor_component_y() const
2155 {
2156 const auto format = _sampler.format;
2157 switch(format)
2158 {
2159 case TensorSamplerFormat::C_WH_1:
2160 return _tensor->component(TensorComponent::WxH);
2161 case TensorSamplerFormat::C_W_H:
2162 return _tensor->component(TensorComponent::W);
2163 default:
2164 std::cout << "Unsupported tensor format" << std::endl;
2165 assert(false);
2166 }
2167 }
2168
2169 std::string tensor_component_z() const
2170 {
2171 const auto format = _sampler.format;
2172 switch(format)
2173 {
2174 case TensorSamplerFormat::C_WH_1:
2175 return "1";
2176 case TensorSamplerFormat::C_W_H:
2177 return _tensor->component(TensorComponent::H);
2178 default:
2179 std::cout << "Unsupported tensor format" << std::endl;
2180 assert(false);
2181 }
2182 }
2183
2184 std::string tensor_component_stride_y() const
2185 {
2186 const auto format = _sampler.format;
2187 switch(format)
2188 {
2189 case TensorSamplerFormat::C_WH_1:
2190 case TensorSamplerFormat::C_W_H:
2191 return _tensor->component(TensorComponent::Stride1);
2192 default:
2193 std::cout << "Unsupported tensor format" << std::endl;
2194 assert(false);
2195 }
2196 }
2197
2198 std::string tensor_component_stride_z() const
2199 {
2200 const auto format = _sampler.format;
2201 switch(format)
2202 {
2203 case TensorSamplerFormat::C_WH_1:
2204 return "0";
2205 case TensorSamplerFormat::C_W_H:
2206 return _tensor->component(TensorComponent::Stride2);
2207 default:
2208 std::cout << "Unsupported tensor format" << std::endl;
2209 assert(false);
2210 }
2211 }
2212
2213 std::string tensor_component_stride_batch() const
2214 {
2215 const auto format = _sampler.format;
2216 switch(format)
2217 {
2218 case TensorSamplerFormat::C_WH_1:
2219 case TensorSamplerFormat::C_W_H:
2220 return _tensor->component(TensorComponent::Stride3);
2221 default:
2222 std::cout << "Unsupported tensor format" << std::endl;
2223 assert(false);
2224 }
2225 }
2226
2227 bool is_one_component_x() const
2228 {
2229 auto t = _tensor->format();
2230 const auto format = _sampler.format;
2231 switch(format)
2232 {
2233 case TensorSamplerFormat::C_WH_1:
2234 case TensorSamplerFormat::C_W_H:
2235 return t.shape[0] == 1;
2236 default:
2237 std::cout << "Unsupported tensor format" << std::endl;
2238 assert(false);
2239 }
2240 }
2241
2242 bool is_one_component_y() const
2243 {
2244 auto t = _tensor->format();
2245 const auto format = _sampler.format;
2246 switch(format)
2247 {
2248 case TensorSamplerFormat::C_WH_1:
2249 return (t.shape[1] * t.shape[2]) == 1;
2250 case TensorSamplerFormat::C_W_H:
2251 return t.shape[1] == 1;
2252 default:
2253 std::cout << "Unsupported tensor format" << std::endl;
2254 assert(false);
2255 }
2256 }
2257
2258 bool is_one_component_z() const
2259 {
2260 auto t = _tensor->format();
2261 const auto format = _sampler.format;
2262 switch(format)
2263 {
2264 case TensorSamplerFormat::C_WH_1:
2265 return true;
2266 case TensorSamplerFormat::C_W_H:
2267 return t.shape[2] == 1;
2268 default:
2269 std::cout << "Unsupported tensor format" << std::endl;
2270 assert(false);
2271 }
2272 }
2273
2274 bool is_one_component_batch() const
2275 {
2276 auto t = _tensor->format();
2277 const auto format = _sampler.format;
2278 switch(format)
2279 {
2280 case TensorSamplerFormat::C_WH_1:
2281 case TensorSamplerFormat::C_W_H:
2282 return t.shape[3] == 1;
2283 default:
2284 std::cout << "Unsupported tensor format" << std::endl;
2285 assert(false);
2286 }
2287 }
2288
2289 GpuSampler gpu_sampler() const
2290 {
2291 return _sampler;
2292 }
2293
2294 IGpuTensorArgument* tensor_argument() const
2295 {
2296 return _tensor;
2297 }
2298
2299private:
2300 GpuSampler _sampler;
2301 IGpuTensorArgument* _tensor;
2302};
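// Example (illustrative sketch): for a tensor of shape [C, W, H, N],
//  - TensorSamplerFormat::C_W_H  maps x -> C, y -> W and z -> H;
//  - TensorSamplerFormat::C_WH_1 maps x -> C, y -> W * H and z -> 1 (its Z stride is "0"),
//    so is_one_component_z() always returns true for this format.
// In both formats the batch stride is the tensor's Stride3 component.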
2303
2304struct GpuKernelWriterAttribute
2305{
2306 bool return_tensor_component_by_value { false };
2307};
2308
2309enum class ConvertPolicy
2310{
2311 Wrap, /**< Wrap around */
2312 Saturate /**< Saturate */
2313};
2314
2315enum class RoundingMode
2316{
2317 None,
2318 Rte,
2319 Rtz,
2320 Rtp,
2321 Rtn
2322};
2323
2324// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html
2325class IGpuKernelWriter
2326{
2327public:
2328 virtual ~IGpuKernelWriter() = default;
2329 virtual void set_IdSpace(int32_t id) = 0;
2330 virtual void import_tile(const std::string& dst, const IVectorTile *src) = 0;
2331 virtual void declare_argument(const std::string& name, const TensorInfo& tensor) = 0;
2332 virtual void declare_tile(const std::string& name, const TileInfo& info) = 0;
2333 virtual void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) = 0;
2334 virtual void write_text(const std::string& x) = 0;
2335 virtual void compound_statement_begin() = 0;
2336 virtual void compound_statement_end() = 0;
2337
2338 // Operations
2339 virtual void op_get_global_id(const Operand& dst_var, int32_t dim) = 0;
2340 virtual void op_get_global_coord(const Operand& dst, const Operand& step, const TensorOperand& tensor, int32_t dim) = 0;
2341 virtual void op_get_global_batch(const Operand& dst, const TensorOperand& tensor) = 0;
2342 virtual void op_get_global_size(const Operand& dst_var, int32_t dim) = 0;
2343 virtual void op_binary_expression(const Operand& dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
2344 virtual void op_assign(const Operand& dst_name, const Operand& src_name) = 0;
2345 virtual void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) = 0;
2346 virtual void op_if(const Operand& lhs, BinaryOp op, const Operand& rhs) = 0;
2347 virtual void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value, AssignmentOp update_op, const Operand& update_value) = 0;
2348 virtual void op_load_indirect(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y_indirect, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
2349 virtual void op_load_immediate(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32), const Operand& dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
2350 virtual void op_store_immediate(const TensorOperand& tensor, const Operand& src, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
2351 virtual void op_cast_expression(const Operand& dst, const Operand &src, ConvertPolicy policy) = 0;
2352 virtual void op_return() = 0;
2353 // virtual void op_else() = 0;
2354 // virtual void op_elseif() = 0;
2355 // Utils
2356 // Computes a tile of indirect addressing offsets (x + y * width, set to -1 when out of bounds)
2357 virtual void util_get_indirect_buffer(const Operand& dst, const TensorOperand& tensor, const Operand& x, const Operand& y, const Operand& x_off, const Operand& y_off) = 0;
2358};
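// Example (illustrative sketch; tile names are hypothetical and "dst_tensor" is a TensorOperand
// declared elsewhere): a typical sequence of writer calls.
//
//   writer->declare_tile("gid_x", TileInfo(DataType::Int32));
//   writer->op_get_global_id(Operand("gid_x"), 0);
//   writer->op_assign(Operand("acc"), Operand("src"));
//   writer->op_store_immediate(dst_tensor, Operand("acc"), Operand("gid_x"), Operand("y"), Operand("z"));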
2359
2360enum class GpuLoadStoreType
2361{
2362 Load = 1,
2363 Store = 2
2364};
2365
2366class IGpuLoadStoreHelperWriter
2367{
2368public:
2369 IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type) : _writer(x), _mapper(mapper), _type(type) {}
2370 IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;
2371 IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;
2372 virtual ~IGpuLoadStoreHelperWriter() = default;
2373 virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;
2374 virtual void write(const std::pair<int32_t, std::string>& y) = 0;
2375 virtual void finalize() = 0;
2376protected:
2377 IGpuKernelWriter* _writer;
2378 GpuTensor3dMapper _mapper;
2379 GpuLoadStoreType _type;
2380};
2381
2382class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter
2383{
2384public:
2385 ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
2386 {
2387 }
2388
2389 ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default;
2390 ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;
2391
2392 static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
2393 {
2394 CKW_UNUSED(x, type, dst);
2395
2396 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
2397 {
2398 return false;
2399 }
2400 return true;
2401 }
2402
2403 void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
2404 {
2405 assert(validate(_writer, _mapper, _type, dst));
2406
2407 _dst = dst;
2408 _ls_width_full = dst->format().w;
2409
2410 _coord_x = x->scalar(0, 0).str;
2411 _coord_z = z->scalar(0, 0).str;
2412 _coord_b = b->scalar(0, 0).str;
2413 _coord_orig_z = _coord_z;
2414
2415 out_of_bound_initialize_x(_coord_x);
2416 out_of_bound_initialize_z(_coord_z);
2417
2418 /*
2419 meaning of else:
2420 - x: partial load/store
2421 - y: no load/store operation
2422 - z: no load/store operation
2423 if(x)
2424 {
2425 if(z)
2426 {
2427 if(y)
2428 {
2429 // full load/store width
2430 }
2431 else
2432 {
2433 // no load/store
2434 }
2435 }
2436 else
2437 {
2438 // no load/store
2439 }
2440 }
2441 else
2442 {
2443 if(z)
2444 {
2445 if(y)
2446 {
2447 // partial load/store width
2448 }
2449 else
2450 {
2451 // no load/store
2452 }
2453 }
2454 else
2455 {
2456 // no load/store
2457 }
2458 }
2459 */
2460 }
2461
2462 void write(const std::pair<int32_t, std::string>& y) override
2463 {
2464 int32_t idx_y = y.first;
2465 std::string coord_y = y.second;
2466
2467 // The only check required is on Y.
2468 out_of_bound_initialize_y(coord_y);
2469
2470 const std::string dst = _dst->vector(idx_y).str;
2471 const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
2472 const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address);
2473
2474 _writer->write_text(ls_buf);
2475 _writer->write_text(";\n");
2476
2477 out_of_bound_finalize_y(dst);
2478
2479 // The left over load/store will be written in the finalize stage
2480 if(_ls_width_part.size() != 0)
2481 {
2482 int32_t w = 0;
2483 for(auto &p : _ls_width_part)
2484 {
2485 const std::string dst0 = _dst->vector(w, p, idx_y).str;
2486 const std::string coord_x = _coord_x + " + " + std::to_string(w);
2487 const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
2488 const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address);
2489 _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0));
2490
2491 w += p;
2492 }
2493 }
2494 }
2495
2496 void finalize() override
2497 {
2498 out_of_bound_finalize_z();
2499 out_of_bound_finalize_x();
2500 }
2501private:
2502 IVectorTile* _dst { nullptr };
2503 int32_t _ls_width_full { 0 };
2504 std::vector<int32_t> _ls_width_part { };
2505 std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x {};
2506 std::string _coord_x {};
2507 std::string _coord_z {};
2508 std::string _coord_orig_z {};
2509 std::string _coord_b {};
2510
2511 void out_of_bound_initialize_x(std::string& coord)
2512 {
2513 if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
2514 {
2515 auto tensor_format = _mapper.tensor_argument()->format();
2516 auto shape = tensor_format.shape;
2517
2518 _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
2519 if(_ls_width_part.size() != 0)
2520 {
2521 _writer->write_text("if(" + coord + " > 0)\n");
2522 _writer->compound_statement_begin();
2523 }
2524 }
2525 };
2526
2527 void out_of_bound_finalize_x()
2528 {
2529 if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
2530 {
2531 if(_ls_width_part.size() != 0)
2532 {
2533 _writer->compound_statement_end();
2534 _writer->write_text("else\n");
2535 _writer->compound_statement_begin();
2536
2537 out_of_bound_initialize_z(_coord_orig_z);
2538 for(auto &i : _leftovers_x)
2539 {
2540 out_of_bound_initialize_y(i.first.second);
2541 _writer->write_text(i.second);
2542 _writer->write_text(";\n");
2543 out_of_bound_finalize_y(i.first.first);
2544 }
2545 out_of_bound_finalize_z();
2546 _writer->compound_statement_end();
2547 }
2548 }
2549 };
2550
2551 void out_of_bound_initialize_y(std::string& coord)
2552 {
2553 std::string max = "";
2554
2555 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2556
2557 switch(address_mode_y)
2558 {
2559 case TensorSamplerAddressModeY::Skip:
2560 case TensorSamplerAddressModeY::ClampToBorder:
2561 // NOTE: This line should not be moved outside of the switch statement.
2562 // Querying the component marks it as used and adds it to the list of kernel arguments.
2563 // Since the component is not required in all cases, we should request it only when it is
2564 // actually used.
2565 max = _mapper.tensor_component_y();
2566 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2567 _writer->compound_statement_begin();
2568 break;
2569 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2570 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2571 _writer->write_text("if(" + coord + " >= 0)\n");
2572 _writer->compound_statement_begin();
2573 break;
2574 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2575 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2576 max = _mapper.tensor_component_y();
2577 _writer->write_text("if(" + coord + " < " + max + ")\n");
2578 _writer->compound_statement_begin();
2579 break;
2580 case TensorSamplerAddressModeY::ClampToNearest:
2581 max = _mapper.tensor_component_y();
2582 coord = "clamp(" + coord + ", 0, " + max + " - 1)";
2583 break;
2584 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
2585 max = _mapper.tensor_component_y();
2586 coord = "min(" + coord + ", " + max + " - 1)";
2587 break;
2588 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
2589 coord = "max(" + coord + ", 0)";
2590 break;
2591 case TensorSamplerAddressModeY::None:
2592 break;
2593 default:
2594 std::cout << "Unsupported address mode for out_of_bound_initialize_y" << std::endl;
2595 assert(false);
2596 }
2597 };
2598
2599 void out_of_bound_finalize_y(const std::string& dst)
2600 {
2601 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2602
2603 switch(address_mode_y)
2604 {
2605 case TensorSamplerAddressModeY::ClampToBorder:
2606 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2607 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2608 case TensorSamplerAddressModeY::Skip:
2609 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2610 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2611 _writer->compound_statement_end();
2612 break;
2613
2614 default: // The remaining address modes do not open a compound statement in out_of_bound_initialize_y()
2615 break;
2616 }
2617
2618 switch(address_mode_y)
2619 {
2620 case TensorSamplerAddressModeY::ClampToBorder:
2621 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2622 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2623 _writer->write_text("else\n");
2624 _writer->compound_statement_begin();
2625 _writer->write_text(dst);
2626 _writer->write_text(" = 0.0f;\n");
2627 _writer->compound_statement_end();
2628 break;
2629
2630 default: // Only the ClampToBorder* modes need an else branch that zero-fills the destination
2631 break;
2632 }
2633 };
2634
2635 void out_of_bound_initialize_z(std::string& coord)
2636 {
2637 std::string max = "";
2638
2639 const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
2640
2641 switch(address_mode_z)
2642 {
2643 case TensorSamplerAddressModeZ::Skip:
2644 max = _mapper.tensor_component_z();
2645 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2646 _writer->compound_statement_begin();
2647 break;
2648 case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
2649 _writer->write_text("if(" + coord + " >= 0)\n");
2650 _writer->compound_statement_begin();
2651 break;
2652 case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
2653 max = _mapper.tensor_component_z();
2654 _writer->write_text("if(" + coord + " < " + max + ")\n");
2655 _writer->compound_statement_begin();
2656 break;
2657 case TensorSamplerAddressModeZ::ClampToNearest:
2658 max = _mapper.tensor_component_z();
2659 coord = "clamp(" + coord + ", 0, " + max + " - 1)";
2660 break;
2661 case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
2662 max = _mapper.tensor_component_z();
2663 coord = "min(" + coord + ", " + max + " - 1)";
2664 break;
2665 case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
2666 coord = "max(" + coord + ", 0)";
2667 break;
2668 case TensorSamplerAddressModeZ::None:
2669 break;
2670 default:
2671 std::cout << "Unsupported address mode for out_of_bound_initialize_z" << std::endl;
2672 assert(false);
2673 }
2674 };
2675
2676 void out_of_bound_finalize_z()
2677 {
2678 const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
2679
2680 switch(address_mode_z)
2681 {
2682 case TensorSamplerAddressModeZ::Skip:
2683 case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
2684 case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
2685 _writer->compound_statement_end();
2686 break;
2687
2688 default: // The remaining address modes do not open a compound statement in out_of_bound_initialize_z()
2689 break;
2690 }
2691 };
2692
2693 std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const
2694 {
2695 std::vector<int32_t> x;
2696
2697 switch(ls_leftover_vector_width)
2698 {
2699 case 0:
2700 break;
2701 case 1:
2702 case 2:
2703 case 3:
2704 case 4:
2705 case 8:
2706 case 16:
2707 x.push_back(ls_leftover_vector_width);
2708 break;
2709 case 5:
2710 x.push_back(4);
2711 x.push_back(1);
2712 break;
2713 case 6:
2714 x.push_back(4);
2715 x.push_back(2);
2716 break;
2717 case 7:
2718 x.push_back(4);
2719 x.push_back(3);
2720 break;
2721 case 9:
2722 x.push_back(8);
2723 x.push_back(1);
2724 break;
2725 case 10:
2726 x.push_back(8);
2727 x.push_back(2);
2728 break;
2729 case 11:
2730 x.push_back(8);
2731 x.push_back(3);
2732 break;
2733 case 12:
2734 x.push_back(8);
2735 x.push_back(4);
2736 break;
2737 case 13:
2738 x.push_back(8);
2739 x.push_back(4);
2740 x.push_back(1);
2741 break;
2742 case 14:
2743 x.push_back(8);
2744 x.push_back(4);
2745 x.push_back(2);
2746 break;
2747 case 15:
2748 x.push_back(8);
2749 x.push_back(4);
2750 x.push_back(3);
2751 break;
2752
2753 default:
2754 assert(false);
2755 }
2756 return x;
2757 }
2758
2759 std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& address)
2760 {
2761 switch(type)
2762 {
2763 case GpuLoadStoreType::Load:
2764 if(vector_width != 1)
2765 {
2766 return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
2767 }
2768 else
2769 {
2770 return data + " = *(" + address + ")";
2771 }
2772 break;
2773 case GpuLoadStoreType::Store:
2774 if(vector_width != 1)
2775 {
2776 return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
2777 }
2778 else
2779 {
2780 return "*(" + address + ") = " + data;
2781 }
2782 break;
2783 default:
2784 std::cout << "Unsupported GpuLoadStoreType" << std::endl;
2785 assert(false);
2786 }
2787 }
2788
2789 std::string to_ls_buffer_address(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
2790 {
2791 auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
2792 assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
2793 const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage);
2794 const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
2795
2796 std::string address;
2797 address += "(__global ";
2798 address += dst_type;
2799 address += "*)(";
2800 address += ptr_buf;
2801 if(x != "0" && (_mapper.is_one_component_x() != true))
2802 {
2803 address += " + (";
2804 address += x + ") * sizeof(" + dst_type + ")";
2805 }
2806 if(y != "0" && (_mapper.is_one_component_y() != true))
2807 {
2808 const std::string stride_y = _mapper.tensor_component_stride_y();
2809 address += " + (";
2810 address += y + ")";
2811 address += " * ";
2812 address += stride_y;
2813 }
2814 if(z != "0" && (_mapper.is_one_component_z() != true))
2815 {
2816 const std::string stride_z = _mapper.tensor_component_stride_z();
2817 address += " + (";
2818 address += z + ")";
2819 address += " * ";
2820 address += stride_z;
2821 }
2822 if(b != "0" && (_mapper.is_one_component_batch() != true))
2823 {
2824 const std::string stride_b = _mapper.tensor_component_stride_batch();
2825 address += " + (";
2826 address += b + ")";
2827 address += " * ";
2828 address += stride_b;
2829 }
2830 address += ")";
2831 return address;
2832 }
2833};
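// Example (illustrative sketch): for an Fp32 tile of width 4 stored through a BufferUint8Ptr
// storage, write() emits a statement of the form
//
//   <dst row> = vload4(0, (__global float*)(<tensor ptr> + (x) * sizeof(float) + (y) * <stride_1> + (z) * <stride_2>));
//
// where <dst row>, <tensor ptr>, <stride_1> and <stride_2> stand for the generated tile variable,
// the storage and the stride components of the tensor argument.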
2834
2835class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
2836{
2837public:
2838 static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type, IVectorTile *dst)
2839 {
2840 CKW_UNUSED(x);
2841
2842 if(dst->format().w != 4)
2843 {
2844 return false;
2845 }
2846 if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
2847 {
2848 return false;
2849 }
2850 if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
2851 {
2852 return false;
2853 }
2854 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
2855 {
2856 return false;
2857 }
2858 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store)
2859 {
2860 return false;
2861 }
2862 if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
2863 {
2864 return false;
2865 }
2866 return true;
2867 /*
2868 - x: Only TensorSamplerAddressModeX::None is supported and the vector length must be 4
2869 - z: Only TensorSamplerAddressModeZ::None is supported
2870 */
2871 }
2872 ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
2873 {
2874 }
2875
2876 ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default;
2877 ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default;
2878
2879 void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
2880 {
2881 assert(validate(_writer, _mapper, _type, dst));
2882
2883 _dst = dst;
2884 _ls_width_full = dst->format().w;
2885 _coord_x = x->scalar(0, 0).str;
2886 _coord_z = z->scalar(0, 0).str;
2887 _coord_b = b->scalar(0, 0).str;
2888
2889 /*
2890 if(y)
2891 {
2892 // full load/store width
2893 }
2894 else
2895 {
2896 // no load/store
2897 }
2898 */
2899 }
2900
2901 void write(const std::pair<int32_t, std::string>& y) override
2902 {
2903 int32_t idx_y = y.first;
2904 std::string coord_y = y.second;
2905
2906 // The only check required is on Y.
2907 out_of_bound_initialize_y(coord_y);
2908
2909 const std::string dst = _dst->vector(idx_y).str;
2910 const std::string sampler = to_ls_image2d_sampler();
2911 const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b);
2912 const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord);
2913
2914 _writer->write_text(ls_buf);
2915 _writer->write_text(";\n");
2916
2917 out_of_bound_finalize_y(dst);
2918 }
2919
2920 void finalize() override
2921 {
2922 }
2923private:
2924 IVectorTile* _dst { nullptr };
2925 int32_t _ls_width_full { 0 };
2926 std::string _coord_x {};
2927 std::string _coord_z {};
2928 std::string _coord_b {};
2929
2930 void out_of_bound_initialize_y(std::string& coord)
2931 {
2932 std::string max = "";
2933
2934 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2935
2936 switch(address_mode_y)
2937 {
2938 case TensorSamplerAddressModeY::Skip:
2939 max = _mapper.tensor_component_y();
2940 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2941 _writer->compound_statement_begin();
2942 break;
2943 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2944 _writer->write_text("if(" + coord + " >= 0)\n");
2945 _writer->compound_statement_begin();
2946 break;
2947 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2948 max = _mapper.tensor_component_y();
2949 _writer->write_text("if(" + coord + " < " + max + ")\n");
2950 _writer->compound_statement_begin();
2951 break;
2952 case TensorSamplerAddressModeY::ClampToBorder:
2953 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2954 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2955 case TensorSamplerAddressModeY::ClampToNearest:
2956 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
2957 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
2958 case TensorSamplerAddressModeY::None:
2959 break;
2960 default:
2961 std::cout << "Unsupported address mode for out_of_bound_initialize_y" << std::endl;
2962 assert(false);
2963 }
2964 };
2965
2966 void out_of_bound_finalize_y(const std::string& dst)
2967 {
2968 CKW_UNUSED(dst);
2969
2970 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2971
2972 switch(address_mode_y)
2973 {
2974 case TensorSamplerAddressModeY::Skip:
2975 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2976 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2977 _writer->compound_statement_end();
2978 break;
2979
2980 default: // The remaining address modes do not open a compound statement in out_of_bound_initialize_y()
2981 break;
2982 }
2983 };
2984
2985 std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& sampler, const std::string& coord)
2986 {
2987 CKW_UNUSED(vector_width);
2988
2989 auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
2990 const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage);
2991 // const DataType dt = _dst->format().dt;
2992 const std::string post_fix = _dst->format().dt == DataType::Fp32? "f" : "h";
2993
2994 switch(type)
2995 {
2996 case GpuLoadStoreType::Load:
2997 return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
2998 break;
2999 case GpuLoadStoreType::Store:
3000 return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
3001 default:
3002 std::cout << "Unsupported GpuLoadStoreType" << std::endl;
3003 assert(false);
3005 }
3006 }
3007
3008 std::string to_ls_image2d_sampler() const
3009 {
3010 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
3011
3012 switch(address_mode_y)
3013 {
3014 case TensorSamplerAddressModeY::None:
3015 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
3016 case TensorSamplerAddressModeY::Skip:
3017 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
3018 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
3019 case TensorSamplerAddressModeY::ClampToBorder:
3020 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
3021 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
3022 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
3023 case TensorSamplerAddressModeY::ClampToNearest:
3024 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
3025 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
3026 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
3027 default:
3028 std::cout << "Unsupported address_mode_coord" << std::endl;
3029 assert(false);
3030 }
3031 }
3032
3033 std::string to_ls_image2d_coord(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
3034 {
3035 std::string coord_x = "(" + x + ") >> 2";
3036 std::string coord_y = "(";
3037
3038 if(y != "0" && (_mapper.is_one_component_y() != true))
3039 {
3040 coord_y += y;
3041 }
3042 if(z != "0" && (_mapper.is_one_component_z() != true))
3043 {
3044 const std::string dim = _mapper.tensor_component_y();
3045 coord_y += " + (";
3046 coord_y += z + ")";
3047 coord_y += " * ";
3048 coord_y += dim;
3049 }
3050 if(b != "0" && (_mapper.is_one_component_batch() != true))
3051 {
3052 const std::string dim0 = _mapper.tensor_component_y();
3053 const std::string dim1 = _mapper.tensor_component_z();
3054 coord_y += " + (";
3055 coord_y += b + ")";
3056 coord_y += " * ";
3057 coord_y += dim0;
3058 coord_y += " * ";
3059 coord_y += dim1;
3060 }
3061 coord_y += ")";
3062 return "(int2)(" + coord_x + ", " + coord_y + ")";
3063 }
3064};
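// Example (illustrative sketch): for an Fp32 tile of width 4 read from an Image2dReadOnly storage
// with address_mode_y = None, write() emits a statement of the form
//
//   <dst row> = read_imagef(<image2d>, CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST,
//                           (int2)((x) >> 2, (y + (z) * <dim_y>)));
//
// i.e. X addresses float4 texels and Z (and the batch index) are folded into the Y coordinate.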
3065
3066/** IGpuLoadStoreHelperWriter factory class */
3067class ClLoadStoreHelperWriterFactory final
3068{
3069public:
3070 /** Static method to create the IGpuLoadStoreHelperWriter subclass that matches the tensor storage set in the mapper
3071 *
3072 * @return IGpuLoadStoreHelperWriter
3073 */
3075 static std::unique_ptr<IGpuLoadStoreHelperWriter> create(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type)
3076 {
3077 const auto tensor_storage = mapper.gpu_sampler().storage;
3078 switch(tensor_storage)
3079 {
3080 case GpuSamplerTensorStorage::BufferUint8Ptr:
3081 return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
3082 case GpuSamplerTensorStorage::Image2dReadOnly:
3083 case GpuSamplerTensorStorage::Image2dWriteOnly:
3084 return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
3085 default:
3086 std::cout << "Unsupported Gpu tensor storage" << std::endl;
3087 assert(false);
3088 }
3089 }
3090};
3091
3092// This utility method needs to go in utils.h
3093inline bool is_tile_scalar(IVectorTile* x)
3094{
3095 return x->format().w == 1 && x->format().h == 1;
3096}
3097
3098class ClKernelWriter : public IGpuKernelWriter
3099{
3100public:
3101 ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
3102 {
3103 _data = x;
3104 _attr = attr;
3105 }
3106
3107 ClKernelWriter(const ClKernelWriter &) = default;
3108 ClKernelWriter &operator=(const ClKernelWriter &) = default;
3109
3110 // An IdSpace ID identifies a fragment that is registered in ICode to ensure
3111 // there are no conflicts or ambiguities in the generated code
3112 void set_IdSpace(int32_t id) override
3113 {
3114 _data->tiles.set_IdSpace(id);
3115 _data->arguments.set_IdSpace(id);
3116 }
3117
3118 void import_tile(const std::string& dst_name, const IVectorTile *src) override
3119 {
3120 _data->tiles.insert(dst_name, src);
3121 }
3122
3123 void declare_argument(const std::string& name, const TensorInfo& tensor) override
3124 {
3125 assert(_data->arguments[name] == nullptr);
3126 _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
3127 }
3128
3129 void declare_tile(const std::string& name, const TileInfo& format) override
3130 {
3131 assert(_data->tiles[name] == nullptr);
3132 _data->tiles.insert(name, format);
3133
3134 IVectorTile *x = _data->tiles[name];
3135
3136 for(auto &t : x->underlying_source_variables())
3137 {
3138 _data->code += t.type.str + " " + t.str + ";\n";
3139 }
3140 }
3141
3142 void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) override
3143 {
3144 assert(_data->tiles[name] == nullptr);
3145 _data->tiles.insert(name, in, dt);
3146 // Note: A constant does not need to be declared in the code
3147 }
3148
3149 void write_text(const std::string& x) override
3150 {
3151 _data->code += x;
3152 }
3153
3154 void compound_statement_begin() override
3155 {
3156 _data->tiles.increment_registry_level();
3157 _data->code += "{\n";
3158 }
3159
3160 void compound_statement_end() override
3161 {
3162 _data->tiles.decrement_registry_level();
3163 _data->code += "}\n";
3164 }
3165
3166 void op_get_global_id(const Operand& dst_var, int32_t dim) override
3167 {
3168 assert(dst_var.type() == OperandType::Tile);
3169 assert(_data->tiles.has_tile(dst_var.value()));
3170 assert(_data->tiles[dst_var.value()]->format().w == 1 &&
3171 _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
3172
3173 auto var = _data->tiles[dst_var.value()];
3174
3175 _data->code += var->scalar(0, 0).str;
3176 _data->code += " = get_global_id(";
3177 _data->code += std::to_string(dim);
3178 _data->code += ");\n";
3179 };
3180
3181 void op_get_global_coord(const Operand& o_dst, const Operand& o_step, const TensorOperand& o_tensor, int32_t dim) override
3182 {
3183 OperandUnpacker operands(_data->tiles, _data->arguments);
3184 auto dst = operands.unpack(o_dst);
3185 auto step = operands.unpack(o_step);
3186
3187 // Validation: Check that x, y and z are scalar
3188
3189 TensorOperandUnpacker tensor_operands(_data->arguments);
3190 auto tensor = tensor_operands.unpack(o_tensor);
3191 auto gpu_sampler = o_tensor.sampler();
3192
3193 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3194
3195 switch (dim)
3196 {
3197 case 0:
3198 if(mapper.is_one_component_x())
3199 {
3200 _data->code += dst->scalar(0, 0).str;
3201 _data->code += " = 0;\n";
3202 }
3203 else
3204 {
3205 if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
3206 {
3207 // Validation: Check: fixed tensor shape
3208 // TO BE CHANGED
3209 _data->code += dst->scalar(0, 0).str;
3210 _data->code += " = get_global_id(0) * ";
3211 _data->code += step->scalar(0, 0).str;
3212 _data->code += ";\n";
3213 }
3214 else
3215 {
3216 _data->code += dst->scalar(0, 0).str;
3217 _data->code += " = get_global_id(0) * ";
3218 _data->code += step->scalar(0, 0).str;
3219 _data->code += ";\n";
3220 }
3221 }
3222 break;
3223 case 1:
3224 if(mapper.is_one_component_y())
3225 {
3226 _data->code += dst->scalar(0, 0).str;
3227 _data->code += " = 0;\n";
3228 }
3229 else
3230 {
3231 if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
3232 {
3233
3234 }
3235 else
3236 {
3237 _data->code += dst->scalar(0, 0).str;
3238 _data->code += " = get_global_id(1) * ";
3239 _data->code += step->scalar(0, 0).str;
3240 _data->code += ";\n";
3241 }
3242 }
3243 break;
3244 case 2:
3245 if(mapper.is_one_component_z())
3246 {
3247 _data->code += dst->scalar(0, 0).str;
3248 _data->code += " = 0;\n";
3249 }
3250 else
3251 {
3252 _data->code += dst->scalar(0, 0).str;
3253 _data->code += " = get_global_id(2) * ";
3254 _data->code += step->scalar(0, 0).str;
3255 _data->code += ";\n";
3256 }
3257 break;
3258 default:
3259 break;
3260 }
3261 };
3262
3263 void op_get_global_batch(const Operand& o_dst, const TensorOperand& o_tensor) override
3264 {
3265 OperandUnpacker operands(_data->tiles, _data->arguments);
3266 auto dst = operands.unpack(o_dst);
3267
3268 TensorOperandUnpacker tensor_operands(_data->arguments);
3269 auto tensor = tensor_operands.unpack(o_tensor);
3270 auto gpu_sampler = o_tensor.sampler();
3271
3272 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3273
3274 if(mapper.is_one_component_batch())
3275 {
3276 _data->code += dst->scalar(0, 0).str;
3277 _data->code += " = 0;\n";
3278 }
3279 else
3280 {
3281 std::cout << "Unsupported batched computation" << std::endl;
3282 assert(false);
3283 }
3284 };
3285
3286 void op_get_global_size(const Operand& dst_var, int32_t dim) override
3287 {
3288 assert(dst_var.type() == OperandType::Tile);
3289 assert(_data->tiles.has_tile(dst_var.value()));
3290 assert(_data->tiles[dst_var.value()]->format().w == 1 &&
3291 _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
3292
3293 auto var = _data->tiles[dst_var.value()];
3294
3295 _data->code += var->scalar(0, 0).str;
3296 _data->code += " = get_global_size(";
3297 _data->code += std::to_string(dim);
3298 _data->code += ");\n";
3299 }
3300
3301 void op_binary_expression(const Operand& dst_name, const Operand& lhs_name, BinaryOp op, const Operand& rhs_name) override
3302 {
3303 OperandUnpacker operands(_data->tiles, _data->arguments);
3304 auto lhs = operands.unpack(lhs_name);
3305 auto rhs = operands.unpack(rhs_name);
3306 auto dst = operands.unpack(dst_name);
3307
3308 const int32_t dst_w = dst->format().w;
3309 const int32_t dst_h = dst->format().h;
3310 assert(lhs != nullptr);
3311 const int32_t lhs_w = lhs->format().w;
3312 const int32_t rhs_w = rhs->format().w;
3313
3314 if(op == BinaryOp::MatMul_Nt_T)
3315 {
3316 assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
3317 for(int32_t y = 0; y < dst_h; ++y)
3318 {
3319 for(int32_t x = 0; x < dst_w; ++x)
3320 {
3321 for(int32_t k = 0; k < lhs_w; ++k)
3322 {
3323 _data->code += dst->scalar(x, y).str;
3324 _data->code += " = fma(";
3325 _data->code += lhs->scalar(k, y).str;
3326 _data->code += ", ";
3327 _data->code += rhs->scalar(k, x).str;
3328 _data->code += ", ";
3329 _data->code += dst->scalar(x, y).str;
3330 _data->code += ");\n";
3331 }
3332 }
3333 }
3334
3335 return;
3336 }
3337
3338 bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
3339 bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;
3340
3341 std::string lhs_prefix = broadcast_lhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
3342 std::string rhs_prefix = broadcast_rhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
3343 std::string op_str = to_string(op);
3344
3345 // Broadcasting on Y is automatic
3346 for(int32_t y = 0; y < dst_h; ++y)
3347 {
3348 _data->code += dst->vector(y).str;
3349 _data->code += " = ";
3350 _data->code += lhs_prefix + lhs->vector(y).str;
3351 _data->code += " ";
3352 _data->code += op_str;
3353 _data->code += " ";
3354 _data->code += rhs_prefix + rhs->vector(y).str;
3355 _data->code += ";\n";
3356 }
3357 };
3358
3359 void op_cast_expression(const Operand& o_dst, const Operand &o_src, ConvertPolicy policy) override
3360 {
3361 CKW_UNUSED(policy);
3362
3363 OperandUnpacker operands(_data->tiles, _data->arguments);
3364 auto src = operands.unpack(o_src);
3365 auto dst = operands.unpack(o_dst);
3366
3367 // const int32_t dst_w = dst->format().w;
3368 const int32_t dst_h = dst->format().h;
3369 const std::string dt = dst->scalar(0, 0).type.str;
3370
3371 // Broadcasting on Y is automatic
3372 for(int32_t y = 0; y < dst_h; ++y)
3373 {
3374 _data->code += dst->vector(y).str;
3375 _data->code += " = convert_" + dt + "(";
3376 _data->code += src->vector(y).str;
3377 _data->code += ");\n";
3378 }
3379 };
3380
3381 void op_assign(const Operand& dst_name, const Operand& src_name) override
3382 {
3383 OperandUnpacker operands(_data->tiles, _data->arguments);
3384 auto src = operands.unpack(src_name);
3385 auto dst = operands.unpack(dst_name);
3386
3387 const int32_t dst_w = dst->format().w;
3388 const int32_t dst_h = dst->format().h;
3389 const int32_t src_w = src->format().w;
3390 // const int32_t src_h = src->format().h;
3391 const std::string dt = dst->scalar(0, 0).type.str;
3392
3393 bool broadcast_src_x = dst_w != 1 && src_w == 1;
3394
3395 std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";
3396
3397 // Broadcasting on Y is automatic
3398 for(int32_t y = 0; y < dst_h; ++y)
3399 {
3400 _data->code += dst->vector(y).str;
3401 _data->code += " = ";
3402 _data->code += src_prefix + src->vector(y).str;
3403 _data->code += ";\n";
3404 }
3405 }
3406
3407 void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) override
3408 {
3409 OperandUnpacker operands(_data->tiles, _data->arguments);
3410 auto src = operands.unpack(src_name);
3411 auto dst = operands.unpack(dst_name);
3412
3413 const int32_t dst_w = dst->format().w;
3414 const int32_t dst_h = dst->format().h;
3415 const int32_t src_w = src->format().w;
3416 // const int32_t src_h = src->format().h;
3417 const std::string dt = dst->scalar(0, 0).type.str;
3418
3419 bool broadcast_src_x = dst_w != 1 && src_w == 1;
3420
3421 std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";
3422
3423 // Broadcasting on Y is automatic
3424 for(int32_t y = 0; y < dst_h; ++y)
3425 {
3426 _data->code += dst->vector(y).str;
3427 _data->code += " = ";
3428
3429 switch(func)
3430 {
3431 case ScalarUnaryFunction::Exp:
3432 _data->code += "exp(";
3433 break;
3434
3435 default:
3436 CKW_ASSERT(false);
3437 }
3438
3439 _data->code += src_prefix + src->vector(y).str;
3440 _data->code += ");\n";
3441 }
3442 }
3443
3444 void op_if(const Operand& o_lhs, BinaryOp op, const Operand& o_rhs) override
3445 {
3446 OperandUnpacker operands(_data->tiles, _data->arguments);
3447 auto lhs = operands.unpack(o_lhs);
3448 auto rhs = operands.unpack(o_rhs);
3449
3450 assert(is_tile_scalar(lhs));
3451 assert(is_tile_scalar(rhs));
3452
3453 _data->code += "if(";
3454 _data->code += lhs->scalar(0, 0).str;
3455 _data->code += " ";
3456 _data->code += to_string(op);
3457 _data->code += " ";
3458 _data->code += rhs->scalar(0, 0).str;
3459 _data->code += ")\n";
3460 }
3461
3462 void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, AssignmentOp update_op, const Operand& update_value_name) override
3463 {
3464 OperandUnpacker operands(_data->tiles, _data->arguments);
3465 auto var = operands.unpack(var_name);
3466 auto cond_value = operands.unpack(cond_value_name);
3467 auto update_value = operands.unpack(update_value_name);
3468
3469 const int32_t dst_w = var->format().w;
3470 const int32_t dst_h = var->format().h;
3471
3472 // It must be a scalar variable
3473 assert(dst_w == 1);
3474 assert(dst_h == 1);
3475
3476 _data->code += "for(; " ;
3477 _data->code += var->scalar(0, 0).str;
3478 _data->code += " ";
3479 _data->code += to_string(cond_op);
3480 _data->code += " " + cond_value->scalar(0, 0).str + "; ";
3481 _data->code += var->scalar(0, 0).str;
3482 _data->code += " ";
3483 _data->code += to_string(update_op);
3484 _data->code += " " + update_value->scalar(0, 0).str + ")";
3485 _data->code += "\n";
3486 }
3487
3488 void op_load_immediate(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_y, const Operand& o_z, const Operand& o_batch_idx, const Operand& dilation_y) override
3489 {
3490 OperandUnpacker operands(_data->tiles, _data->arguments);
3491 auto dst = operands.unpack(o_dst);
3492 auto x = operands.unpack(o_x);
3493 auto y = operands.unpack(o_y);
3494 auto z = operands.unpack(o_z);
3495 auto dil_y = operands.unpack(dilation_y);
3496 auto b = operands.unpack(o_batch_idx);
3497
3498 TensorOperandUnpacker tensor_operands(_data->arguments);
3499 auto tensor = tensor_operands.unpack(o_tensor);
3500 auto gpu_sampler = o_tensor.sampler();
3501
3502 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3503
3504 auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
3505
3506 // Initialize the constant part
3507 load_writer->initialize(dst, x, z, b);
3508
3509 for(int i = 0; i < dst->format().h; ++i)
3510 {
3511 std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
3512 if(dil_y->scalar(0, 0).str != "1")
3513 {
3514 coord_y += " * " + dil_y->scalar(0, 0).str;
3515 }
3516 load_writer->write(std::make_pair(i, coord_y));
3517 }
3518
3519 load_writer->finalize();
3520 }
3521
3522 void op_load_indirect(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_indirect_h, const Operand& o_z, const Operand& o_batch_idx) override
3523 {
3524 OperandUnpacker operands(_data->tiles, _data->arguments);
3525 auto dst = operands.unpack(o_dst);
3526 auto x = operands.unpack(o_x);
3527 auto y_ind = operands.unpack(o_indirect_h);
3528 auto z = operands.unpack(o_z);
3529 auto b = operands.unpack(o_batch_idx);
3530
3531 TensorOperandUnpacker tensor_operands(_data->arguments);
3532 auto tensor = tensor_operands.unpack(o_tensor);
3533 auto gpu_sampler = o_tensor.sampler();
3534
3535 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3536
3537 auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
3538
3539 // Initialize the constant part
3540 load_writer->initialize(dst, x, z, b);
3541
3542 for(int i = 0; i < dst->format().h; ++i)
3543 {
3544 load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
3545 }
3546
3547 load_writer->finalize();
3548 }
3549
3550 void op_store_immediate(const TensorOperand& tensor_name, const Operand& src_name, const Operand& x_name, const Operand& y_name, const Operand& z_name, const Operand& batch_index_name) override
3551 {
3552 OperandUnpacker operands(_data->tiles, _data->arguments);
3553 auto src = operands.unpack(src_name);
3554 auto x = operands.unpack(x_name);
3555 auto y = operands.unpack(y_name);
3556 auto z = operands.unpack(z_name);
3557 auto b = operands.unpack(batch_index_name);
3558
3559 TensorOperandUnpacker tensor_operands(_data->arguments);
3560 auto tensor = tensor_operands.unpack(tensor_name);
3561 auto gpu_sampler = tensor_name.sampler();
3562
3563 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3564
3565 auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);
3566
3567 // Initialize the constant part
3568 store_writer->initialize(src, x, z, b);
3569
3570 int32_t tile_h = src->format().h;
3571
3572 for(int m0 = tile_h - 1; m0 >= 0; m0--)
3573 {
3574 store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
3575 }
3576
3577 store_writer->finalize();
3578 }
3579
3580 void op_return() override
3581 {
3582 _data->code += "return;\n";
3583 }
3584
3585 void util_get_indirect_buffer(const Operand& o_dst, const TensorOperand& o_tensor, const Operand& o_x, const Operand& o_y, const Operand& o_x_off, const Operand& o_y_off) override
3586 {
3587 OperandUnpacker operands(_data->tiles, _data->arguments);
3588 auto dst = operands.unpack(o_dst);
3589 auto x = operands.unpack(o_x);
3590 auto y = operands.unpack(o_y);
3591 auto x_off = operands.unpack(o_x_off);
3592 auto y_off = operands.unpack(o_y_off);
3593
3594 TensorOperandUnpacker tensor_operands(_data->arguments);
3595 auto tensor = tensor_operands.unpack(o_tensor);
3596
3597 assert(dst->format().w == 1);
3598 assert(x->format().w == 1);
3599 assert(y->format().w == 1);
3600 assert(x_off->format().w == 1);
3601 assert(y_off->format().w == 1);
3602 assert(dst->format().dt == DataType::Int32);
3603 assert(x->format().dt == DataType::Int32);
3604 assert(y->format().dt == DataType::Int32);
3605 assert(x_off->format().dt == DataType::Int32);
3606 assert(y_off->format().dt == DataType::Int32);
3607
3608 const std::string width = tensor->component(TensorComponent::W);
3609 const std::string height = tensor->component(TensorComponent::H);
3610 const std::string wxh = tensor->component(TensorComponent::WxH);
3611 /*
3612 int x_s;
3613 int y_s;
3614 x_s = (xi_0 + x_k);
3615 y_s = (yi_0 + y_k);
3616 mi_0 = x_s + y_s * width + b * widthxheight;
3617 mi_0 = select(-1, mi_0, x_s >= 0);
3618 mi_0 = select(-1, mi_0, y_s >= 0);
3619 mi_0 = select(-1, mi_0, x_s < 128);
3620 mi_0 = select(-1, mi_0, y_s < 128);
3621 */
3622 compound_statement_begin();
3623 declare_tile("_x_s", TileInfo(DataType::Int32));
3624 declare_tile("_y_s", TileInfo(DataType::Int32));
3625 auto x_s = operands.unpack(Operand("_x_s"));
3626 auto y_s = operands.unpack(Operand("_y_s"));
3627 for(int i = 0; i < dst->format().h; ++i)
3628 {
3629 // x_s = (xi_0 + x_k);
3630 // y_s = (yi_0 + y_k);
3631 _data->code += x_s->scalar(0, i).str;
3632 _data->code += " = (";
3633 _data->code += x->scalar(0, i).str;
3634 _data->code += " + ";
3635 _data->code += x_off->scalar(0, i).str;
3636 _data->code += ");\n";
3637 _data->code += y_s->scalar(0, i).str;
3638 _data->code += " = (";
3639 _data->code += y->scalar(0, i).str;
3640 _data->code += " + ";
3641 _data->code += y_off->scalar(0, i).str;
3642 _data->code += ");\n";
3643 // mi_0 = x_s + y_s * width;
3644 _data->code += dst->scalar(0, i).str;
3645 _data->code += " = ";
3646 _data->code += x_s->scalar(0, i).str;
3647 _data->code += " + ";
3648 _data->code += y_s->scalar(0, i).str;
3649 _data->code += " * " + width + ";\n";
3650 // mi_0 = select(wxh, mi_0, x_s >= 0);
3651 _data->code += dst->scalar(0, i).str;
3652 _data->code += " = select(-1, ";
3653 _data->code += dst->scalar(0, i).str;
3654 _data->code += ", ";
3655 _data->code += x_s->scalar(0, i).str;
3656 _data->code += " >= 0);\n";
3657 // mi_0 = select(wxh, mi_0, y_s >= 0);
3658 _data->code += dst->scalar(0, i).str;
3659 _data->code += " = select(-1, ";
3660 _data->code += dst->scalar(0, i).str;
3661 _data->code += ", ";
3662 _data->code += y_s->scalar(0, i).str;
3663 _data->code += " >= 0);\n";
3664 // mi_0 = select(wxh, mi_0, x_s < width);
3665 _data->code += dst->scalar(0, i).str;
3666 _data->code += " = select(-1, ";
3667 _data->code += dst->scalar(0, i).str;
3668 _data->code += ", ";
3669 _data->code += x_s->scalar(0, i).str;
3670 _data->code += " < ";
3671 _data->code += width + ");\n";
3672 // mi_0 = select(wxh, mi_0, y_s < height);
3673 _data->code += dst->scalar(0, i).str;
3674 _data->code += " = select(-1, ";
3675 _data->code += dst->scalar(0, i).str;
3676 _data->code += ", ";
3677 _data->code += y_s->scalar(0, i).str;
3678 _data->code += " < ";
3679 _data->code += height + ");\n";
3680 }
3681 compound_statement_end();
3682 }
3683
3684private:
3685 GpuKernelWriterDataHolder* _data { nullptr };
3686 GpuKernelWriterAttribute * _attr { nullptr };
3687};
3688
3689/** IGpuKernelWriter factory class */
3690class GpuKernelWriterFactory final
3691{
3692public:
3693 /** Static method to create the IGpuKernelWriter subclass that matches the GPU programming language
3694 *
3695 * @param[in] attr Writer attributes
3696 * @param[in] x Kernel data holder, which provides the target programming language
3697 *
3698 * @return IGpuKernelWriter
3699 */
3699 static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
3700 {
3701 switch(x->programming_language())
3702 {
3703 case GpuTargetLanguage::OpenCL:
3704 return std::make_unique<ClKernelWriter>(attr, x);
3705 default:
3706 std::cout << "Unsupported Gpu programming language" << std::endl;
3707 assert(false);
3708 }
3709 }
3710};
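// Example (illustrative sketch): minimal end-to-end flow. The tensor info "dst_info", the tile
// and kernel names are hypothetical.
//
//   GpuKernelWriterAttribute  attr { false };
//   GpuKernelWriterDataHolder data(GpuTargetLanguage::OpenCL);
//   auto writer = GpuKernelWriterFactory::create(&attr, &data);
//   writer->set_IdSpace(1);
//   writer->declare_argument("dst", dst_info);
//   writer->declare_tile("gid_x", TileInfo(DataType::Int32));
//   writer->op_get_global_id(Operand("gid_x"), 0);
//
//   GpuKernel kernel;
//   kernel.code = generate_code(data, "example_kernel");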
3711
3712inline int32_t adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx)
3713{
3714 auto tensor = tensor_info_id->shape;
3715
3716 int32_t dim[3] = {0};
3717
3718 switch(tensor_format)
3719 {
3720 case TensorSamplerFormat::C_W_H:
3721 dim[0] = tensor[0];
3722 dim[1] = tensor[1];
3723 dim[2] = tensor[2];
3724 break;
3725 case TensorSamplerFormat::C_WH_1:
3726 dim[0] = tensor[0];
3727 dim[1] = tensor[1] * tensor[2];
3728 dim[2] = 1;
3729 break;
3730 default:
3731 std::cout << "Unsupported tensor format" << std::endl;
3732 assert(false);
3733 break;
3734 }
3735
3736 return std::min(step, dim[idx]);
3737}
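// Example: with TensorSamplerFormat::C_WH_1, a tensor of shape [3, 7, 5, 1] and idx = 1,
// the addressable extent is 7 * 5 = 35, so adjust_step() returns min(step, 35): a requested
// step of 4 is kept, while a step of 64 is clamped to 35.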
3738
3739} // namespace prototype
3740} // namespace ckw
3741
3742#endif // CKW_SRC_PROTOTYPE_H