compute_kernel_writer/prototype/src/Prototype.h - ml/ComputeLibrary - Gitiles

 /*
  * Copyright (c) 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */

 #ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H
 #define CKW_PROTOTYPE_SRC_PROTOTYPE_H

 #include <vector>
 #include <map>
 #include <string>
 #include <cstdint> // int32_t
 #include <iostream> // cout (to be removed)
 #include <cassert>  // assert (to be removed)
 #include <unordered_map>
 #include <chrono>
 #include <cmath>
 #include <memory>
 #include <algorithm>
 #include <array>
 #include <stdexcept>

 #include "ckw/Types.h"
 #include "ckw/TensorInfo.h"
 #include "ckw/Error.h"

 namespace ckw
 {
 namespace prototype {

 // Dummy data structure for Size2D
 using Size2D = std::vector<int32_t>;

 // Dummy Status
 using Status = void;

 enum class ComponentType : int32_t
 {
     Complex   = 0,
     Simple    = 1,
     Unfusable = 2
 };

 enum class GpuCompilationSpeed
 {
     Fast   = 0x00,  // fast compilation may increase the latency of the network
     Slow   = 0x01   // slow compilation may decrease the latency of the network
 };

 enum class GpuExtensions
 {
     Fp16,
     Dot8,
     Mmul,
     FastMath
 };

 struct TensorInfo
 {
     TensorShape shape { {0} };
     DataType    data_type { DataType::Unknown };
     TensorDataLayout  data_layout { TensorDataLayout::Nhwc };
     int32_t     id { -1 };
 };

 struct ComponentAttribute
 {
     GpuCompilationSpeed compilation_speed {GpuCompilationSpeed::Fast};
     bool                overwrite_tile { true };
 };

 inline std::string data_type_to_cl_type(DataType dt)
 {
     switch(dt)
     {
         case DataType::Fp32:
             return "float";
         case DataType::Fp16:
             return "half";
         case DataType::Int8:
             return "char";
         case DataType::Uint8:
             return "uchar";
         case DataType::Uint16:
             return "ushort";
         case DataType::Int16:
             return "short";
         case DataType::Uint32:
             return "uint";
         case DataType::Int32:
             return "int";
         case DataType::Bool:
             return "bool";
         default:
             assert(false);
             return "";
     }
 }

 inline int32_t width_to_cl_vector_size(int32_t width)
 {
     switch(width)
     {
         case 1:
             return 1;
         case 2:
             return 2;
         case 3:
             return 3;
         case 4:
             return 4;
         case 5:
         case 6:
         case 7:
         case 8:
             return 8;
         case 9:
         case 10:
         case 11:
         case 12:
         case 13:
         case 14:
         case 15:
         case 16:
             return 16;
         default:
             assert(false);
             return 0;
     }
 }

 inline std::string get_cl_data_type(DataType dt, int32_t width)
 {
     std::string data_type;
     int32_t w = width_to_cl_vector_size(width);
     data_type += data_type_to_cl_type(dt);
     if(w != 1)
     {
         data_type += std::to_string(w);
     }
     return data_type;
 }

 inline std::string to_opencl_store(int32_t vector_length)
 {
     if(vector_length != 1)
     {
         return "vstore" + std::to_string(vector_length) + "(";
     }
     else
     {
         return "*(";
     }
 }

 struct TileInfo
 {
     TileInfo() {}
     TileInfo(DataType dt) : dt(dt), w(1), h(1) {}
     TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) {}
     TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) {}
     DataType dt{ DataType::Unknown }; // Data type of the tile
     int32_t  w{ 0 };  // Width (i.e. c0 - portion of the channels)
     int32_t  h{ 0 };  // Height (i.e. s0 - portion of the spatial dimensions)
 };

 inline std::ostream& operator << (std::ostream& o, const TileInfo& a)
 {
     o << a.w << " x " << a.h;
     return o;
 }

 struct DataTypeAsString
 {
     std::string str { "" };
     DataType    dt { DataType::Unknown };
     int32_t     size { 1 };
 };

 struct ValueAsString
 {
     std::string      str { "" };
     DataTypeAsString type { };
 };

 // https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
 // A Tile is a collection of variables used to express a 2D data.
 class IScalarTile
 {
 public:
     virtual ~IScalarTile() = default;
     /** Method to get the scalar variable from a tile
      * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
      * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
      *
      * @return the scalar variable as a string
      */
     virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;
     /** Method to get the list of underlying variable names used by the tile
      *
      * @return the list of variable names
      */
     virtual std::vector<ValueAsString> underlying_source_variables() const = 0;
     /** Method to get the name of the tile.
      *
      * @return the name of the tile
      */
     std::string name() const
     {
         return _basename;
     }
     /** Method to get the tile format
      *
      * @return the format
      */
     TileInfo format() const
     {
         return _format;
     }
     /** Method to know whether the tile is assignable or not (constant)
      *
      * @return true if the tile is assignable
      */
     virtual bool is_assignable() const = 0;
     /** Method to know whether the tile needs to be declared
      *
      * @return true if the tile needs to be declared in the code before being used
      */
     virtual bool need_declaration() const = 0;
 protected:
     TileInfo  _format { };       // Tile format
     std::string _basename { "" };  // Tile name
 };

 // A tile is a collection of variables used to express a 2D data. The variables are vectors in the GPU context.
 // The vector size is given by the width of the tile. The number of vectors height by depth defines the number of vectors
 class IVectorTile : public IScalarTile
 {
 public:
     virtual ~IVectorTile() = default;
     /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
      *  The user can query the list of supported width for the vectors through preferred_vector_sizes().
      *
      * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
      *
      * @return the vector variable as a string
      */
     virtual ValueAsString vector(int32_t y) const = 0;
     /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
      *
      * @return the vector variable as a string
      */
     virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
     /** Method to get the preferred vector sizes.
      *
      * @return a vector with the preferred vector sizes
      */
     //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
 };

 class ClTile : public IVectorTile
 {
 public:
     ClTile(const std::string& name, TileInfo format)
     {
         _format = format;
         _basename = name;
     }

     ValueAsString scalar(int32_t x, int32_t y) const override
     {
         x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
         y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));

         ValueAsString t;
         t.str       = build_variable_name(y);
         t.type.str  = get_cl_data_type(_format.dt, 1);
         t.type.dt   = _format.dt;
         t.type.size = 1;

         // Check required because if the width has only one element, we cannot use .s0
         if(_format.w != 1)
         {
             // Automatic broadcasting
             t.str += ".s" + std::to_string(x);
         }

         return t;
     }

     ValueAsString vector(int32_t y) const override
     {
         y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));

         ValueAsString t;
         t.str       = build_variable_name(y);
         t.type.str  = get_cl_data_type(_format.dt, _format.w);
         t.type.dt   = _format.dt;
         t.type.size = _format.w;
         return t;
     }

     ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
     {
         y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));

         ValueAsString t;
         t.str       = build_variable_name(y);
         t.type.str  = get_cl_data_type(_format.dt, width);
         t.type.dt   = _format.dt;
         t.type.size = width;

         if(_format.w != 1)
         {
             t.str += ".s";
             for(int i = 0; i < width; ++i)
             {
                 t.str += to_scalar_hex(x_start + i);
             }
         }
         return t;
     }

     std::vector<ValueAsString> underlying_source_variables() const override
     {
         std::vector<ValueAsString> vars;
         for(int32_t y = 0; y < _format.h; ++y)
         {
             ValueAsString t;
             t.str       = build_variable_name(y);
             t.type.str  = get_cl_data_type(_format.dt, _format.w);
             t.type.dt   = _format.dt;
             t.type.size = _format.w;
             vars.push_back(t);
         }
         return vars;
     }

     bool is_assignable() const override
     {
         return true;
     }

     bool need_declaration() const override
     {
         return true;
     }

 private:
     std::string build_variable_name(int32_t y) const
     {
         std::string var_name = _basename;

         if(_format.h == 1)
         {
             return var_name;

         }
         else
         {
             var_name += "_";
             var_name += std::to_string(y);
         }

         return var_name;
     }

     std::string to_scalar_hex(int32_t x) const
     {
         switch(x)
         {
             case 0:
             case 1:
             case 2:
             case 3:
             case 4:
             case 5:
             case 6:
             case 7:
             case 8:
             case 9:
                 return std::to_string(x);
             case 10:
                 return "A";
             case 11:
                 return "B";
             case 12:
                 return "C";
             case 13:
                 return "D";
             case 14:
                 return "E";
             case 15:
                 return "F";
             default:
                 std::cout << "Unsupported hexadecimal value" << std::endl;
                 assert(false);
                 return "";
         }
     }
 };

 // Unique features: It contains values in the form of string. The name used for this object is misleading since the variables can change the value over time.
 class ClConstantTile : public IVectorTile
 {
 public:
     ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt)
     {
         _format.w  = in[0].size();
         _format.h  = in.size();
         _format.dt = dt;

         _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));

         for(int32_t y = 0; y < _format.h; ++y)
         {
             for(int32_t x = 0; x < _format.w; ++x)
             {
                 _data[y][x] = in[y][x];
             }
         }
     }

     ValueAsString scalar(int32_t x, int32_t y) const override
     {
         x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
         y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));

         ValueAsString t;
         t.str       = _data[y][x];
         t.type.str  = get_cl_data_type(_format.dt, 1);
         t.type.dt   = _format.dt;
         t.type.size = 1;

         return t;
     }

     ValueAsString vector(int32_t y) const override
     {
         y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));

         return vector(0, _format.w, y);
     }

     ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
     {
         y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));

         ValueAsString t;
         t.str       = "";
         t.type.str  = get_cl_data_type(_format.dt, width);
         t.type.dt   = _format.dt;
         t.type.size = width;

         if(width > 1)
         {
             t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
         }

         int32_t x = x_start;
         for(; x < width - 1; ++x)
         {
             t.str += scalar(x, y).str;
             t.str += ", ";
         }
         t.str += scalar(x, y).str;

         if(width > 1)
         {
             t.str += "))";
         }

         return t;
     }

     std::vector<ValueAsString> underlying_source_variables() const override
     {
         std::vector<ValueAsString> vars;

         for(int32_t y = 0; y < _format.h; ++y)
         {
             for(int32_t x = 0; x < _format.w; ++x)
             {
                 ValueAsString t;
                 t.str       = _data[y][x];
                 t.type.str  = get_cl_data_type(_format.dt, 1);
                 t.type.dt   = _format.dt;
                 t.type.size = 1;
                 vars.push_back(t);
             }
         }

         return vars;
     }

     bool is_assignable() const override
     {
         return false;
     }

     bool need_declaration() const override
     {
         return false;
     }

 private:
     std::vector<std::vector<std::string>> _data{};
 };

 enum class TensorComponentIndex : int32_t
 {
     IndexMask = 0x0000000f,
 };

 enum class TensorComponentType : int32_t
 {
     OffsetFirstElement = 0x00000100,
     Stride             = 0x00001000,
     Dimension          = 0x00010000,
     FoldedDimension    = 0x00100000,
     Constant           = 0x01000000
 };

 enum class TensorComponent : int32_t
 {
     Unknown             = 0x00000000,
     OffsetFirstElement  = 0x00000100,
     Stride1             = 0x00001001,
     Stride2             = 0x00001002,
     Stride3             = 0x00001003,
     Stride4             = 0x00001004,
     Dim0                = 0x00010000,
     Dim1                = 0x00010001,
     Dim2                = 0x00010002,
     Dim3                = 0x00010003,
     Dim4                = 0x00010004,
     C                   = 0x00010000,   // Dim0
     W                   = 0x00010001,   // Dim1
     H                   = 0x00010002,   // Dim2
     D                   = 0x00010003,
     N                   = 0x00010004,
     Dim1xDim2           = 0x00100021,
     Dim1xDim2xDim3      = 0x00100321,
     WxH                 = 0x00100021,
     WxHxD               = 0x00100321
 };

 inline std::string to_string(TensorComponent x)
 {
     switch(x)
     {
         case TensorComponent::Unknown:
         return "Unknown";
         case TensorComponent::OffsetFirstElement:
         return "OffsetFirstElement";
         case TensorComponent::Stride1:
         return "Stride1";
         case TensorComponent::Stride2:
         return "Stride2";
         case TensorComponent::Stride3:
         return "Stride3";
         case TensorComponent::Stride4:
         return "Stride4";
         case TensorComponent::Dim0:
         return "Dim0";
         case TensorComponent::Dim1:
         return "Dim1";
         case TensorComponent::Dim2:
         return "Dim2";
         case TensorComponent::Dim3:
         return "Dim3";
         case TensorComponent::Dim4:
         return "Dim4";
         case TensorComponent::Dim1xDim2:
         return "Dim1xDim2";
         case TensorComponent::Dim1xDim2xDim3:
         return "Dim1xDim2xDim3";
         default:
         assert(false);
     }
 }

 class ITensorArgument
 {
 public:
     virtual ~ITensorArgument() = default;
     /** Method to get the tensor component as a string
      *
      * @param[in] x tensor component to query
      *
      * @return  the tensor component as a string
      */
     virtual std::string component(TensorComponent x) = 0;
     /** Method to get the tensor component type declaration as a string
      *
      * @return  the tensor component type declaration as a string
      */
     virtual std::string component_type_declaration() const = 0;
     /** Method to get the tensor component data type
      *
      * @return  the tensor component data type
      */
     virtual DataType component_data_type() const = 0;
     /** Method to get the tensor component declarations
      *
      * @return a vector containing the tensor component declarations
      */
     virtual std::vector<TensorComponent> component_declarations() const = 0;
     /** Method to get the name of the tensor argument.
      *
      * @return the name of the tensor argument
      */
     std::string name() const
     {
         return _basename;
     }
     /** Method to get the tensor format
      *
      * @return the format
      */
     TensorInfo format() const
     {
         return _format;
     }

 protected:
     TensorInfo   _format { };
     std::string  _basename {};
 };

 enum class GpuTensorStorage : int32_t
 {
     Unknown          = 0x0000,
     BufferUint8Ptr   = 0x0012,
     Image2dReadOnly  = 0x0020,
     Image2dWriteOnly = 0x0021,
     Image3dReadOnly  = 0x0030,
     Image3dWriteOnly = 0x0031
 };

 class IGpuTensorArgument : public ITensorArgument
 {
 public:
     virtual ~IGpuTensorArgument() = default;
     /** Method to get the tensor storage, which is the underlying storage used to keep the data memory
      *
      * @param[in] x tensor storage to query
      *
      * @return  the tensor storage as a string
      */
     virtual std::string storage(GpuTensorStorage x) = 0;
     /** Method to get the tensor storage type declaration as a string
      *
      * @param[in] x tensor component to query
      *
      * @return  the tensor storage type declaration as a string
      */
     virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;
     /** Method to get the tensor storage declarations
      *
      * @return a vector containing the tensor storage declarations
      */
     virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
 };

 class ClTensorArgument : public IGpuTensorArgument
 {
 public:
     ClTensorArgument(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
     {
         _basename = name;
         _format   = x;
         _return_by_value_when_possible = return_by_value_when_possible;
     }

     // Methods to override
     std::string component(TensorComponent x) override
     {
         if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Constant)))
         {
             int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
             return std::to_string(idx - 1);
         }

         if(_return_by_value_when_possible)
         {
             if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Dimension)))
             {
                 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
                 return std::to_string(_format.shape[idx]);
             }

             if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::FoldedDimension)))
             {
                 switch(x)
                 {
                     case TensorComponent::Dim1xDim2:
                     return std::to_string(_format.shape[1] * _format.shape[2]);
                     case TensorComponent::Dim1xDim2xDim3:
                     return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[2]);
                     default:
                     std::cout << "Unsupported folded dimension" << std::endl;
                     assert(false);
                 }
             }
         }

         if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
         {
             _components_required.push_back(x);
         }

         return build_component_name(x);
     }

     std::string component_type_declaration() const override
     {
         return "int";
     };

     DataType component_data_type() const override
     {
         return DataType::Int32;
     }

     std::string storage(GpuTensorStorage x) override
     {
         if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
         {
             _storage_required.push_back(x);
         }

         return build_storage_name(x);
     }

     std::string storage_type_declaration(GpuTensorStorage x) const override
     {
         switch(x)
         {
             case GpuTensorStorage::BufferUint8Ptr:
                 return "__global uchar*";
             case GpuTensorStorage::Image2dReadOnly:
                 return "__read_only image2d_t";
             case GpuTensorStorage::Image2dWriteOnly:
                 return "__write_only image2d_t";
             case GpuTensorStorage::Image3dReadOnly:
                 return "__read_only image3d_t ";
             case GpuTensorStorage::Image3dWriteOnly:
                 return "__write_only image3d_t ";
             default:
                 std::cout << "Unsupported storage" << std::endl;
                 assert(false);
                 return "";
         }
     };

     std::vector<GpuTensorStorage> storage_declarations() const override
     {
         return _storage_required;
     }

     std::vector<TensorComponent> component_declarations() const override
     {
         return _components_required;
     }

 private:
     std::string build_storage_name(GpuTensorStorage x) const
     {
         std::string var_name = _basename;

         switch(x)
         {
             case GpuTensorStorage::BufferUint8Ptr:
                 return var_name + "_ptr";
             case GpuTensorStorage::Image2dReadOnly:
             case GpuTensorStorage::Image2dWriteOnly:
                 return var_name + "_img2d";
             case GpuTensorStorage::Image3dReadOnly:
             case GpuTensorStorage::Image3dWriteOnly:
                 return var_name + "_img3d";
             default:
                 std::cout << "Unsupported storage" << std::endl;
                 assert(false);
         }

         return var_name;
     }

     std::string build_component_name(TensorComponent x) const
     {
         std::string var_name = _basename;

         switch(x)
         {
             case TensorComponent::OffsetFirstElement:
                 return var_name + "_offset_first_element";
             case TensorComponent::Stride1:
                 return var_name + "_stride1";
             case TensorComponent::Stride2:
                 return var_name + "_stride2";
             case TensorComponent::Stride3:
                 return var_name + "_stride3";
             case TensorComponent::Dim0:
                 return var_name + "_dim0";
             case TensorComponent::Dim1:
                 return var_name + "_dim1";
             case TensorComponent::Dim2:
                 return var_name + "_dim2";
             case TensorComponent::Dim3:
                 return var_name + "_dim3";
             case TensorComponent::Dim1xDim2:
                 return var_name + "_dim1xdim2";
             case TensorComponent::Dim1xDim2xDim3:
                 return var_name + "_dim1xdim2xdim3";
             default:
                 std::cout << "Unsupported component" << std::endl;
                 assert(false);
         }

         return var_name;
     }

     bool                          _return_by_value_when_possible { false };
     std::vector<GpuTensorStorage> _storage_required {};
     std::vector<TensorComponent>  _components_required {};
 };

 /**
  * @brief Data structure that contains the declared tiles by the components.
  * The registry is a linear data structure that follows the similar principle of the stack. The user can use the @p increment_registry_level() method to
  * increase the level of the stack (0 when it starts). When the user uses the @p decrement_registry_level() method, the registry decreases the level of the stack
  * and remove (pop) all the tiles from the level above.
  * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code.
  * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions
  * when declaring tiles among different components.
  *
  */
 class GpuTileRegistry
 {
 public:
 enum class RegistryTileType
 {
     Tile,
     Link
 };

 using RegistryIdSpace            = int32_t;
 using RegistryLevel              = int32_t;
 using RegistryTileName           = std::string;

 struct RegistryTileTableEntry
 {
     RegistryLevel              registry_level { 0 };
     std::unique_ptr<IVectorTile>  tile_object { nullptr };
 };

 struct RegistryTileTypeTableEntry
 {
     RegistryTileType           tile_type { RegistryTileType::Tile };
     RegistryTileName           tile_name {};
     RegistryIdSpace            registry_idspace { 0 };
     RegistryLevel              registry_level { 0 };
 };

 using RegistryTileTable      = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
 using RegistryTileTypeTable  = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>;
     /**
      * @brief Construct a new Gpu Tile Registry object
      *
      */
     GpuTileRegistry()
     {
         _language = GpuTargetLanguage::Unknown;
     }
     /**
      * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language
      *
      * @param[in] language Gpu programming language to use
      */
     GpuTileRegistry(GpuTargetLanguage language)
     {
         _language = language;
     }
     /**
      * @brief Default destructor. Destroy the Gpu Tile Registry object
      *
      */
     ~GpuTileRegistry() = default;
     /**
      * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles.
      *        Therefore, the IdSpace should be set before declaring any tiles.
      *
      * @param[in] id The IdSpace id
      */
     void set_IdSpace(int32_t id)
     {
         _IdSpace = id;
     }
     /**
      * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles
      *
      * @return The IdSpace id
      */
     int32_t IdSpace() const
     {
         return _IdSpace;
     }
     /**
      * @brief Gets all the IdSpace declarations defined in the tile registry.
      *
      * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
      */
     std::vector<int32_t> IdSpace_declarations() const
     {
         std::vector<int32_t> x;

         auto it = _frags.begin();

         while (it != _frags.end())
         {
             x.push_back(it->first);

             it++;
         }

         return x;
     }
     /**
      * @brief Declare a tile from a previously created tile
      */
     void insert(const std::string& name, const IVectorTile *frag)
     {
         assert(_language == GpuTargetLanguage::OpenCL);
         const int32_t     key_IdSpace   = _IdSpace;
         const std::string key_var_name  = name;
         const std::string var_name      = frag->name();
         TileInfo        format        = frag->format();

         // First check whether a tile with the same name exists
         IVectorTile *result = (*this)[key_var_name];
         assert(result == nullptr);
         if(result == nullptr)
         {
             std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);

             _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
             _frags[key_IdSpace][key_var_name].registry_level = _registry_level;

             _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Link;
             _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
             _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
             _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
         }
     }
     /**
      * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace()
      *
      * @note The reference name used for declaring the tile should not be previously used in the IdSpace
      *
      * @param[in] name   Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
      * @param[in] format Tile format use to use
      */
     void insert(const std::string& name, const TileInfo& format)
     {
         assert(_language == GpuTargetLanguage::OpenCL);
         const int32_t     key_IdSpace   = _IdSpace;
         const std::string key_var_name  = name;
         const std::string var_name      = generate_tile_name(name);

         // First check whether a tile with the same name exists
         IVectorTile *result = (*this)[key_var_name];
         assert(result == nullptr);
         if(result == nullptr)
         {
             std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
             _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
             _frags[key_IdSpace][key_var_name].registry_level = _registry_level;

             _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Tile;
             _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
             _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
             _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
         }
     }
     /**
      * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
      *
      * @note The reference name used for declaring the tile should not be previously used in the IdSpace
      *
      * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
      * @param[in] in   A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile
      * @param[in] dt   The data type for the elements stored in the 3D std::vector as std::string. It is user's responsibilty to ensure
      *                 that the data type is aligned with the content of the std::string.
      */
     void insert(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt)
     {
         assert(_language == GpuTargetLanguage::OpenCL);
         const int32_t     key_IdSpace  = _IdSpace;
         const std::string key_var_name = name;

         // First check whether a tile with the same name exists
         IVectorTile *result = (*this)[key_var_name];
         assert(result == nullptr);
         if(result == nullptr)
         {
             std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
             _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
             _frags[key_IdSpace][key_var_name].registry_level = _registry_level;

             _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Tile;
             _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
             _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
             _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
         }
     }
     /**
      * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
      *
      * @note This method can be used to declare temporary tiles that need to be accessed only once.
      *
      * @param[in] in   A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile
      * @param[in] dt   The data type for the elements stored in the 3D std::vector as std::string. It is user responsibilty to ensure
      *                 that the data type is aligned with what passed with the std::string.
      *
      * @return IVectorTile* the anonymous constant tile
      */
     IVectorTile* insert(const std::vector<std::vector<std::string>>& in, DataType dt)
     {
         assert(_language == GpuTargetLanguage::OpenCL);
         const int32_t     key_IdSpace = _IdSpace;
         const std::string key_var_name  = "_" + std::to_string(_anonymous_frag_count++);

         // First check whether a tile with the same name exists
         IVectorTile *result = (*this)[key_var_name];
         assert(result == nullptr);
         if(result == nullptr)
         {
             std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
             _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
             _frags[key_IdSpace][key_var_name].registry_level = _registry_level;

             _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Tile;
             _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
             _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
             _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
         }

         return (*this)[key_var_name];
     }
     /**
      * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user
      *
      * @param[in] name         The name of the tile to retrieve
      * @param[in] IdSpace The IdSpace id where to search the tile
      *
      * @return IVectorTile* The tile
      */
     IVectorTile* get(const std::string& name, int32_t IdSpace)
     {
         const int32_t     key_IdSpace = IdSpace;
         const std::string key_var_name  = name;

         IVectorTile* result = nullptr;
         auto search_IdSpace = _frags.find(key_IdSpace);
         if(search_IdSpace != _frags.end())
         {
             auto search_tile = _frags[key_IdSpace].find(key_var_name);
             if(search_tile != _frags[key_IdSpace].end())
             {
                 result = search_tile->second.tile_object.get();
                 assert(result != nullptr);
             }
         }

         return result;
     }
     /**
      * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace()
      *
      * @param[in] name The name of the tile to retrieve
      *
      * @return IVectorTile* The tile
      */
     IVectorTile* operator[](const std::string& name)
     {
         return get(name, _IdSpace);
     }
     /**
      * @brief Check whether the tile in the in the IdSpace provided by the user exists
      *
      * @param[in] name         Name of the tile to search for
      * @param[in] IdSpace The IdSpace id where to search the tile
      *
      * @return true if the tile exists
      * @return false if the tile does not exist
      */
     bool has_tile(const std::string& name, int32_t IdSpace) const
     {
         const int32_t     key_IdSpace = IdSpace;
         const std::string key_var_name  = name;

         // IVectorTile* result = nullptr;
         auto search_IdSpace = _frags.find(key_IdSpace);

         return search_IdSpace != _frags.end();
     }
     /**
      * @brief Check whether the tile within the current IdSpace exists
      *
      * @param[in] name Name of the tile to search for
      *
      * @return true if the tile exists
      * @return false if the tile does not exist
      */
     bool has_tile(const std::string& name) const
     {
         return has_tile(name, _IdSpace);
     }
     /**
      * @brief Get all the tiles declared within the IdSpace provided by the user
      *
      * @param[in] IdSpace IdSpace where to retrieve all the declared tiles
      *
      * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user
      */
     std::vector<IVectorTile*> tile_declarations(int32_t IdSpace)
     {
         std::vector<IVectorTile*> tiles;

         std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();

         while (it != _frag_types[IdSpace].end())
         {
             // The following line should be enabled. However, we cannot at this stage
             // because it used to retrieve the output tile produced by each component.
             // However, this method should NOT be used to retrieve the output tile
             //if(it->second.tile_type == RegistryTileType::Tile)
             {
                 tiles.push_back(get(it->second.tile_name, it->second.registry_idspace));
             }
             it++;
         }

         return tiles;
     }
     /**
      * @brief Increase the level of stack.
      *
      */
     void increment_registry_level()
     {
         _registry_level++;
     }
     /**
      * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack.
      *
      */
     void decrement_registry_level()
     {
         assert(_registry_level >= 0);

         // Remove all variables in the local scope
         std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();

         while (it != _frags[_IdSpace].end())
         {
             if (it->second.registry_level == _registry_level)
             {
                 it = _frags[_IdSpace].erase(it);
             }
             else
             {
                 it++;
             }
         }

         std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();

         while (it_type != _frag_types[_IdSpace].end())
         {
             if (it_type->second.registry_level == _registry_level)
             {
                 it_type = _frag_types[_IdSpace].erase(it_type);
             }
             else
             {
                 it_type++;
             }
         }

         _registry_level--;
     }
     /**
      * @brief Get the level of the stack
      *
      */
     int32_t level() const
     {
         return _registry_level;
     }

 private:
     // This method ensures that the key is unique among different components
     std::string generate_tile_name(const std::string& name)
     {
         assert(_IdSpace >= 0 );
         if(_registry_level == 0)
         {
             return "_G" + std::to_string(_IdSpace) + "_" + name;
         }
         else
         {
             return name;
         }
     }
     RegistryTileTable     _frags {};
     RegistryTileTypeTable _frag_types {};
     RegistryLevel         _registry_level { 0 };
     RegistryIdSpace       _IdSpace { -1 };
     int32_t               _anonymous_frag_count { 0 };    // Counter used to create the anonymous tiles
     GpuTargetLanguage     _language { GpuTargetLanguage::Unknown }; // Gpu programming language
 };

 using TensorEntry = std::unique_ptr<IGpuTensorArgument>;

 /**
  * @brief Data structure that contains the tensors consumed by the components.
  * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions
  * when declaring tensors among different components.
  *
  */
 class GpuTensorArgumentRegistry
 {
 public:
     /**
      * @brief Construct a new Gpu Tensor Registry object
      *
      */
     GpuTensorArgumentRegistry()
     {
         _language = GpuTargetLanguage::Unknown;
     }
     /**
      * @brief Construct a new Gpu Tensor Registry object
      *
      * @param[in] language Gpu programming language to use
      */
     GpuTensorArgumentRegistry(GpuTargetLanguage language)
     {
         _language = language;
     }
     /**
      * @brief Default destructor. Destroy the Gpu Tensor Registry object
      *
      */
     ~GpuTensorArgumentRegistry() = default;
     /**
      * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors.
      *        Therefore, the IdSpace should be set before declaring any tensors.
      *
      * @param[in] id The IdSpace id
      */
     void set_IdSpace(int32_t id)
     {
         _IdSpace = id;
     }
     /**
      * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors
      *
      * @return The IdSpace id
      */
     int32_t IdSpace() const
     {
         return _IdSpace;
     }
     /**
      * @brief Gets all the IdSpace declarations defined in the tensor registry.
      *
      * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
      */
     std::vector<int32_t> IdSpace_declarations() const
     {
         std::vector<int32_t> x;

         auto it = _refs.begin();

         while (it != _refs.end())
         {
             x.push_back(it->first);

             it++;
         }

         return x;
     }
     /**
      * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace()
      *
      * @note The reference name used for declaring the tensor should not be previously used in the IdSpace
      *
      * @param[in] name                          Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry.
      * @param[in] x                             Pair of tensor info and tensor id
      * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components
      */
     void insert(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
     {
         assert(_language == GpuTargetLanguage::OpenCL);
         const int32_t     key_IdSpace    = _IdSpace;
         const int32_t     tensor_id      = x.id;
         const std::string key_var_name   = name;
         const std::string var_name       = generate_tensor_name(name, tensor_id);

         // First, check whether the tensor has already a reference. If so, trigger an assert
         assert(!has_tensor_argument(name));

         // Check whether a tensor with that tensorID exists
         auto result = _tensor_arguments.find(tensor_id);
         if(result == _tensor_arguments.end())
         {
             // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference
             std::unique_ptr<ClTensorArgument> arg = std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
             _tensor_arguments[tensor_id] = std::move(arg);
         }

         _refs[key_IdSpace][key_var_name] = tensor_id;
     }
     /**
      * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace()
      *
      * @param[in] name The name of the tensor to retrieve
      *
      * @return IGpuTensor* The tensor
      */
     IGpuTensorArgument* operator[](const std::string& name)
     {
         const int32_t     key_IdSpace = _IdSpace;
         const std::string key_var_name  = name;

         IGpuTensorArgument* result = nullptr;
         auto search_IdSpace = _refs.find(key_IdSpace);
         if(search_IdSpace != _refs.end())
         {
             auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);

             if(search_tensor_id != _refs[key_IdSpace].end())
             {
                 const int32_t tensor_id = search_tensor_id->second;
                 auto search_tensor_argument = _tensor_arguments.find(tensor_id);
                 if(search_tensor_argument != _tensor_arguments.end())
                 {
                     result = search_tensor_argument->second.get();
                 }
                 assert(result != nullptr);
             }
         }

         return result;
     }
     /**
      * @brief Get all the tensors declared in the IdSpace provided by the user
      *
      * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors
      */
     std::vector<IGpuTensorArgument*> tensor_argument_declarations()
     {
         std::vector<IGpuTensorArgument*> args;

         auto it = _tensor_arguments.begin();

         while (it != _tensor_arguments.end())
         {
             args.push_back(it->second.get());
             it++;
         }

         return args;
     }
     /**
      * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists
      *
      * @param[in] name Name of the tensor argument to search for
      *
      * @return true if the tensor argument exists
      * @return false if the tensor argument does not exist
      */
     bool has_tensor_argument(const std::string& name)
     {
         const int32_t     key_IdSpace = _IdSpace;
         const std::string key_var_name  = name;

         auto search_IdSpace = _refs.find(key_IdSpace);

         if(search_IdSpace != _refs.end())
         {
             auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);

             return search_tensor_id != _refs[key_IdSpace].end();
         }
         else
         {
             return false;
         }
     }
     /**
      * @brief Check whether the tensor argument is in the the IdSpace provided by the user
      *
      * @param[in] name    Name of the tensor argument to search for
      * @param[in] IdSpace The IdSpace id where to search the tensor argument
      *
      * @return true if the tile exists
      * @return false if the tile does not exist
      */
     bool has_tensor_argument(const std::string& name, int32_t IdSpace)
     {
         const int32_t     key_IdSpace = IdSpace;
         const std::string key_var_name  = name;

         auto search_IdSpace = _refs.find(key_IdSpace);

         if(search_IdSpace != _refs.end())
         {
             auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);

             return search_tensor_id != _refs[key_IdSpace].end();
         }
         else
         {
             return false;
         }
     }
 private:
     // This method ensures that the key is unique among different components
     std::string generate_tensor_name(const std::string& name, int32_t tensor_id)
     {
         assert(tensor_id >= 0 );

         return name + std::to_string(tensor_id);
     }

     std::map<int32_t, TensorEntry>                    _tensor_arguments {};
     std::map<int32_t, std::map<std::string, int32_t>> _refs {};
     int32_t                                           _IdSpace { -1 };
     GpuTargetLanguage                                 _language { GpuTargetLanguage::Unknown }; // Gpu programming language
 };

 enum class OpType : int32_t
 {
     Elementwise = 0x0000,
     Relational  = 0x1000,
     Algebra     = 0x2000
 };

 inline std::string to_string(AssignmentOp op)
 {
     switch(op)
     {
         case AssignmentOp::Decrement:
             return "-=";
         case AssignmentOp::Increment:
             return "+=";
         default:
             assert(false);
             return "";
     }
 }

 inline std::string to_string(BinaryOp op)
 {
     switch(op)
     {
         case BinaryOp::Add:
             return "+";
         case BinaryOp::Sub:
             return "-";
         case BinaryOp::Mul:
             return "*";
         case BinaryOp::Div:
             return "/";
         case BinaryOp::Mod:
             return "%";
         case BinaryOp::Equal:
             return "==";
         case BinaryOp::Less:
             return "<";
         case BinaryOp::LessEqual:
             return "<=";
         case BinaryOp::Greater:
             return ">";
         case BinaryOp::GreaterEqual:
             return ">=";
         case BinaryOp::LogicalAnd:
             return "&&";
         case BinaryOp::LogicalOr:
             return "||";
         case BinaryOp::LogicalNot:
             return "!";
         default:
             assert(false);
             return "";
     }
 }

 inline std::string binary_op_string(BinaryOp op)
 {
     switch(op)
     {
         case BinaryOp::Add:
             return "add";
         case BinaryOp::Sub:
             return "sub";
         case BinaryOp::Mul:
             return "mul";
         case BinaryOp::Div:
             return "div";
         case BinaryOp::Mod:
             return "mod";
         case BinaryOp::Equal:
             return "eq";
         case BinaryOp::Less:
             return "gt";
         case BinaryOp::LessEqual:
             return "gteq";
         case BinaryOp::Greater:
             return "lt";
         case BinaryOp::GreaterEqual:
             return "lte";
         default:
             assert(false);
             return "";
     }
 }

 enum class OperandType : int32_t
 {
     Unknown              = 0x00000000,
     ScalarFp32           = 0x00001011, // Immediate scalar tile
     ScalarFp16           = 0x00001012, // Immediate scalar tile
     ScalarInt32          = 0x00001021, // Immediate scalar tile
     ScalarInt16          = 0x00001022, // Immediate scalar tile
     ScalarInt8           = 0x00001024, // Immediate scalar tile
     ScalarUInt32         = 0x00001031, // Immediate scalar tile
     ScalarUInt16         = 0x00001032, // Immediate scalar tile
     ScalarUInt8          = 0x00001034, // Immediate scalar tile
     ScalarBool           = 0x00001041, // Immediate scalar tile
     ScalarTile           = 0x00001050, // Scalar from a tile
     Tile                 = 0x00010000, // Tile
     TensorStride1        = 0x00100001, // Tensor component
     TensorStride2        = 0x00100002, // Tensor component
     TensorStride3        = 0x00100003, // Tensor component
     TensorStride4        = 0x00100004, // Tensor component
     TensorDim0           = 0x00100010, // Tensor component
     TensorDim1           = 0x00100020, // Tensor component
     TensorDim2           = 0x00100030, // Tensor component
     TensorDim3           = 0x00100040, // Tensor component
     TensorDim4           = 0x00100050, // Tensor component
     TensorC              = 0x00100010, // Tensor component
     TensorW              = 0x00100020, // Tensor component
     TensorH              = 0x00100030, // Tensor component
     TensorD              = 0x00100040, // Tensor component
     TensorN              = 0x00100050, // Tensor component
     TensorDim1xDim2      = 0x00100100, // Tensor component
     TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
     TensorWxH            = 0x00100300, // Tensor component
     TensorWxHxD          = 0x00100400, // Tensor component
     TensorDataOffset     = 0x00100500, // Tensor component
 };

 struct ScalarTileCoord
 {
     ScalarTileCoord() {}
     ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) {}
     int32_t x { -1 };
     int32_t y { -1 };
 };
 /**
  * @brief Operand class. This object is used to pass the operands to the operations performed by the writer.
  * Operand can be of three types:
  * -# Scalar immediate: constant expression
  * -# Tile: A tile
  * -# Tensor component: A component (scalar) of a tensor
  *
  */
 class Operand
 {
 public:
     Operand(const std::string &val)
     {
         _str  = val;
         _type = OperandType::Tile;
     }

     Operand(const std::string &val, const ScalarTileCoord& coord)
     {
         _str   = val;
         _type  = OperandType::ScalarTile;
         _coord = coord;
     }

     Operand(const std::string &val, OperandType type)
     {
         _str  = val;
         _type = type;
     }

     Operand(const Operand& t)
     {
         _str  = t.value();
         _type = t.type();
     }

     Operand& operator=(const Operand& t)
     {
         _str   = t.value();
         _type  = t.type();
         _coord = t.scalar_tile_coordinate();
         return *this;
     }

     std::string value() const
     {
         return _str;
     }

     OperandType type() const
     {
         return _type;
     }

     ScalarTileCoord scalar_tile_coordinate() const
     {
         return _coord;
     }

 private:
     std::string     _str {};
     OperandType     _type { OperandType::Unknown };
     ScalarTileCoord _coord {};
 };

 enum class GpuSamplerTensorStorage : int32_t
 {
     Unknown          = static_cast<int32_t>(GpuTensorStorage::Unknown),
     BufferUint8Ptr   = static_cast<int32_t>(GpuTensorStorage::BufferUint8Ptr),
     Image2dReadOnly  = static_cast<int32_t>(GpuTensorStorage::Image2dReadOnly),
     Image2dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image2dWriteOnly),
     Image3dReadOnly  = static_cast<int32_t>(GpuTensorStorage::Image3dReadOnly),
     Image3dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image2dWriteOnly),
 };

 struct GpuSampler
 {
     GpuSampler() = default;
     TensorSamplerFormat  format { TensorSamplerFormat::Unknown };
     GpuSamplerTensorStorage storage { GpuSamplerTensorStorage::Unknown };
     TensorSamplerAddressModeX  address_mode_x { TensorSamplerAddressModeX::Unknown };
     TensorSamplerAddressModeY  address_mode_y { TensorSamplerAddressModeY::Unknown };
     TensorSamplerAddressModeZ  address_mode_z { TensorSamplerAddressModeZ::Unknown };
 };

 inline GpuSampler create_simple_sampler(const TensorInfo* tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
 {
     CKW_UNUSED(step_x, step_y, step_z);

     auto tensor = tensor_info_id->shape;

     GpuSampler dst_sampler;
     dst_sampler.format         = sampler.format;
     dst_sampler.storage        = GpuSamplerTensorStorage::BufferUint8Ptr;
     dst_sampler.address_mode_x = sampler.address_mode_x;
     dst_sampler.address_mode_y = sampler.address_mode_y;
     dst_sampler.address_mode_z = sampler.address_mode_z;

     int32_t dim_x = 0;
     int32_t dim_y = 0;
     int32_t dim_z = 0;

     switch(sampler.format)
     {
         case TensorSamplerFormat::C_W_H:
         dim_x = tensor[0];
         dim_y = tensor[1];
         dim_z = tensor[2];
         break;
         case TensorSamplerFormat::C_WH_1:
         dim_x = tensor[0];
         dim_y = tensor[1] * tensor[2];
         dim_z = 1;
         break;
         default:
         std::cout << "Unsupported tensor format" << std::endl;
         assert(false);
         break;
     }

     if(dim_x == 1)
     {
         assert(step_x == 1);
         dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
     }

     if(dim_y == 1)
     {
         assert(step_y == 1);
         dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
     }

     if(dim_z == 1)
     {
         assert(step_z == 1);
         dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
     }

     return dst_sampler;
 }

 class GpuOutputSampler
 {
 public:
     GpuOutputSampler() = default;
     /**
      * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once
      *        by the root component. Once initialized, all simpler components will need to used this sampler
      *        or a broadcasted version of it
      *
      * @param[in] sampler GpuSampler
      * @param[in] step_x  Increment step in the X direction. Not necessarily it is the same of n0 of tile!
      * @param[in] step_y  Increment step in the Y direction. Not necessarily it is the same of m0 of tile!
      * @param[in] step_z  Increment step in the Z direction. Not necessarily it is the same of d0 of tile!
      */
     void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z)
     {
         assert(_is_initialized == false);

         _step_x         = step_x;
         _step_y         = step_y;
         _step_z         = step_z;
         _tensor_info_id = tensor_info_id;
         _sampler        = create_sampler(tensor_storage, tensor_format);
         _is_initialized = true;
     };

     GpuSampler sampler() const
     {
         return _sampler;
     };

     int32_t step_x() const
     {
         return _step_x;
     };

     int32_t step_y() const
     {
         return _step_y;
     };

     int32_t step_z() const
     {
         return _step_z;
     };
 private:
     GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format)
     {
         // Output can only be in output mode
         assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly);
         assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly);

         auto tensor = _tensor_info_id->shape;

         GpuSampler sampler;
         sampler.format         = tensor_format;
         sampler.storage        = tensor_storage;
         sampler.address_mode_x = TensorSamplerAddressModeX::None;
         sampler.address_mode_y = TensorSamplerAddressModeY::None;
         sampler.address_mode_z = TensorSamplerAddressModeZ::None;

         // In the case of texture, we do not need any special checks at the border
         if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
         {
             int32_t dim_x = 0;
             int32_t dim_y = 0;
             int32_t dim_z = 0;

             switch(tensor_format)
             {
                 case TensorSamplerFormat::C_W_H:
                 dim_x = tensor[0];
                 dim_y = tensor[1];
                 dim_z = tensor[2];
                 break;
                 case TensorSamplerFormat::C_WH_1:
                 dim_x = tensor[0];
                 dim_y = tensor[1] * tensor[2];
                 dim_z = 1;
                 break;
                 default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 break;
             }

             if((dim_x % _step_x) != 0 && dim_x != 1)
             {
                 sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
             }

             if((dim_y % _step_y) != 0 && dim_y != 1)
             {
                 sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
             }

             if((dim_z % _step_z) != 0 && dim_z != 1)
             {
                 sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
             }
         }

         return sampler;
     }
     GpuSampler              _sampler { }; // GpuSampler
     int32_t                 _step_x { 1 };
     int32_t                 _step_y { 1 };
     int32_t                 _step_z { 1 };
     const TensorInfo* _tensor_info_id { nullptr };
     bool                    _is_initialized { false };
 };

 /**
  * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer.
  */
 class TensorOperand
 {
 public:
     TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
     {
     }

     TensorOperand& operator=(const TensorOperand& t)
     {
         _str     = t.value();
         _sampler = t.sampler();
         return *this;
     }

     std::string value() const
     {
         return _str;
     }

     GpuSampler sampler() const
     {
         return _sampler;
     }

 private:
     std::string _str {};
     GpuSampler  _sampler {};
 };

 /**
  * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer
  *        This data structure must be initialized before being passed to the Gpu Kernel Writer
  *
  */
 class GpuKernelWriterDataHolder
 {
 public:
     /**
      * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
      *        the GPU target and target specific capabilities (extensions). For now, we just initialize the
      *        programming language
      *
      * @param[in] language Gpu programming language to use
      */
     GpuKernelWriterDataHolder(GpuTargetLanguage language) : tiles(language), arguments(language), code(""), _language(language)
     {
     }
     /**
      * @brief Get the Gpu programming language used
      *
      * @return GpuTargetLanguage the Gpu programming language
      */
     GpuTargetLanguage programming_language() const
     {
         return _language;
     }
     /**
      * @brief @ref GpuTileRegistry
      *
      */
     GpuTileRegistry tiles{};
     /**
      * @brief @ref GpuTensorArgumentRegistry
      *
      */
     GpuTensorArgumentRegistry arguments{};
     /**
      * @brief @ref GpuOutputSampler.
      *
      */
     GpuOutputSampler output_sampler{};
     /**
      * @brief Source code
      *
      */
     std::string code{};

     // GpuExtensionRegistry extensions{};
 private:
     GpuTargetLanguage _language;
 };

 struct LWS
 {
     int32_t x {1};
     int32_t y {1};
     int32_t z {1};
 };

 /**
  * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker
  *        declare an anonymous tile in the tile registry.
  */
 class OperandUnpacker
 {
 public:
     OperandUnpacker(GpuTileRegistry& tiles, GpuTensorArgumentRegistry& arguments) : _tiles(tiles), _arguments(arguments)
     {
         // Increase the level of the stack to allocate possible temporary tiles
         _tiles.increment_registry_level();
     };

     ~OperandUnpacker()
     {
         // Decrease the level of the stack to deallocate any temporary tiles
         _tiles.decrement_registry_level();
     }

     IVectorTile* unpack(const Operand& src)
     {
         // Get the tile
         if(src.type() == OperandType::Tile)
         {
             assert(_tiles.has_tile(src.value()));
             return _tiles[src.value()];
         }
         // Create an anonymous tile with a constant
         else if(static_cast<int32_t>(src.type()) & 0x00001000)
         {
             if(src.type() == OperandType::ScalarTile)
             {
                 ScalarTileCoord coord = src.scalar_tile_coordinate();
                 assert(_tiles.has_tile(src.value()));
                 assert(coord.x >= 0);
                 assert(coord.y >= 0);
                 auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
                 return _tiles.insert({{{val.str}}}, val.type.dt);
             }
             else
             {
                 return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
             }
         }
         // Create an anonymous tile with the tensor component
         else
         {
             assert(_arguments.has_tensor_argument(src.value()));
             auto x = _arguments[src.value()];
             const std::string val = x->component(to_tensor_component(src.type()));
             const DataType    dt  = x->component_data_type();
             return _tiles.insert({{{val}}}, dt);
         }
     }

 private:
     DataType to_tile_data_type(OperandType x)
     {
         return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
     }

     TensorComponent to_tensor_component(OperandType x)
     {
         switch(x)
         {
             case OperandType::TensorDim0:
             return TensorComponent::Dim0;
             case OperandType::TensorDim1:
             return TensorComponent::Dim1;
             case OperandType::TensorDim2:
             return TensorComponent::Dim2;
             case OperandType::TensorDim3:
             return TensorComponent::Dim3;
             case OperandType::TensorDim4:
             return TensorComponent::Dim4;
             case OperandType::TensorStride1:
             return TensorComponent::Stride1;
             case OperandType::TensorStride2:
             return TensorComponent::Stride2;
             case OperandType::TensorStride3:
             return TensorComponent::Stride3;
             case OperandType::TensorStride4:
             return TensorComponent::Stride4;
             case OperandType::TensorDim1xDim2:
             return TensorComponent::Dim1xDim2;
             case OperandType::TensorDim1xDim2xDim3:
             return TensorComponent::Dim1xDim2xDim3;
             case OperandType::TensorDataOffset:
             return TensorComponent::OffsetFirstElement;
             default:
             assert(false);
             return TensorComponent::Unknown;
         }
     }

     GpuTileRegistry&       _tiles;
     GpuTensorArgumentRegistry& _arguments;
 };

 /**
  * @brief Utility class used to get the tensor argument from the operand. If the operand is not a tile, @ref OperandUnpacker
  *        declare an anonymous tile in the tile registry.
  *        Tensor dimension reduction aims for reducing the tensor data dimension while keeping data's tensor structure.
  */
 class TensorOperandUnpacker
 {
 public:
     TensorOperandUnpacker(GpuTensorArgumentRegistry& arguments) : _arguments(arguments)
     {
     };

     IGpuTensorArgument* unpack(const TensorOperand& src)
     {
         assert(_arguments.has_tensor_argument(src.value()));
         return _arguments[src.value()];
     }

 private:
     GpuTensorArgumentRegistry& _arguments;
 };

 /**
  * @brief The GpuKernel will be used in three occasions (stages):
  * #- Compilation stage
  * #- Tuning stage
  * #- Dispatch stage
  */
 struct GpuKernel
 {
     // Compilation stage
     std::string                                       code {};               // Source code, required for the compilation stage
     std::vector<GpuExtensions>                        list_extensions{};     // Extensions, required for the compilation stage
     // Tuning stage
     std::string                                       config_id {};          // Unique id, required for the tuning stage
     std::vector<LWS>                                  list_lws{};            // LWS to test, required for the tuning stage
     // Dispatch stage
     GpuOutputSampler                                  output_sampler{};      // GpuOutputSampler, required for the dispatch stage
     std::vector<std::pair<int32_t, GpuTensorStorage>> list_tensor_storages;  // List of tensor storages, required for the dispatch stage
     std::vector<std::pair<int32_t, TensorComponent>>  list_tensor_components;// List of tensor components (width, stride,..), required for the dispatch stage)
 };

 // This function should produce an object with the source
 inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string& name)
 {
     std::string code;
     code += "__kernel void ";
     code += name;
     code += "(\n";

     auto IdSpaces = in.arguments.IdSpace_declarations();

     std::vector<std::string> arg_str;

     auto tensor_args = in.arguments.tensor_argument_declarations();

     for(auto &i : tensor_args)
     {
         // For each tensor used, get the storage and tensor components
         auto storages   = i->storage_declarations();
         auto components = i->component_declarations();

         for(auto &y : storages)
         {
             std::string str;
             str += i->storage_type_declaration(y);
             str += " ";
             str += i->storage(y);
             arg_str.push_back(str);
         }

         for(auto &y : components)
         {
             std::string str;
             str += i->component_type_declaration();
             str += " ";
             str += i->component(y);
             arg_str.push_back(str);
         }
     }

     for(size_t i = 0; i < arg_str.size(); ++i)
     {
         code += arg_str[i];
         if(i + 1 < arg_str.size())
         {
             code += ",\n";
         }
     }

     code += ")\n";
     code += "{\n";
     code += in.code;
     code += "}\n";

     return code;
 }

 /**
  * @brief This class is responsible to map a N-Tensor to a 3d tensor. The mapper needs the GpuSampler to know
  * how to reduce the dimensionality of a tensor
  *
  */
 class GpuTensor3dMapper
 {
 public:
     GpuTensor3dMapper(IGpuTensorArgument* tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor)
     {
     };

     std::string tensor_component_x() const
     {
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
             case TensorSamplerFormat::C_W_H:
                 return _tensor->component(TensorComponent::C);
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     std::string tensor_component_y() const
     {
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
                 return _tensor->component(TensorComponent::WxH);
             case TensorSamplerFormat::C_W_H:
                 return _tensor->component(TensorComponent::W);
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     std::string tensor_component_z() const
     {
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
                 return "1";
             case TensorSamplerFormat::C_W_H:
                 return _tensor->component(TensorComponent::H);
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     std::string tensor_component_stride_y() const
     {
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
             case TensorSamplerFormat::C_W_H:
                 return _tensor->component(TensorComponent::Stride1);
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     std::string tensor_component_stride_z() const
     {
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
                 return "0";
             case TensorSamplerFormat::C_W_H:
                 return _tensor->component(TensorComponent::Stride2);
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     std::string tensor_component_stride_batch() const
     {
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
             case TensorSamplerFormat::C_W_H:
                 return _tensor->component(TensorComponent::Stride3);
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     bool is_one_component_x() const
     {
         auto t = _tensor->format();
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
             case TensorSamplerFormat::C_W_H:
                 return t.shape[0] == 1;
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     bool is_one_component_y() const
     {
         auto t = _tensor->format();
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
                 return (t.shape[1] * t.shape[2]) == 1;
             case TensorSamplerFormat::C_W_H:
                 return t.shape[1] == 1;
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     bool is_one_component_z() const
     {
         auto t = _tensor->format();
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
                 return true;
             case TensorSamplerFormat::C_W_H:
                 return t.shape[2] == 1;
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     bool is_one_component_batch() const
     {
         auto t = _tensor->format();
         const auto format = _sampler.format;
         switch(format)
         {
             case TensorSamplerFormat::C_WH_1:
             case TensorSamplerFormat::C_W_H:
                 return t.shape[3] == 1;
             default:
                 std::cout << "Unsupported tensor format" << std::endl;
                 assert(false);
                 return "";
         }
     }

     GpuSampler gpu_sampler() const
     {
         return _sampler;
     }

     IGpuTensorArgument* tensor_argument() const
     {
         return _tensor;
     }

 private:
     GpuSampler          _sampler;
     IGpuTensorArgument* _tensor;
 };

 struct GpuKernelWriterAttribute
 {
     bool return_tensor_component_by_value { false };
 };

 enum class ConvertPolicy
 {
     Wrap,    /**< Wrap around */
     Saturate /**< Saturate */
 };

 enum class RoundingMode
 {
     None,
     Rte,
     Rtz,
     Rtp,
     Rtn
 };

 // https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html
 class IGpuKernelWriter
 {
 public:
     virtual ~IGpuKernelWriter() = default;
     virtual void set_IdSpace(int32_t id) = 0;
     virtual void import_tile(const std::string& dst, const IVectorTile *src) = 0;
     virtual void declare_argument(const std::string& name, const TensorInfo& tensor) = 0;
     virtual void declare_tile(const std::string& name, const TileInfo& info) = 0;
     virtual void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) = 0;
     virtual void write_text(const std::string& x) = 0;
     virtual void compound_statement_begin() = 0;
     virtual void compound_statement_end() = 0;

     // Operations
     virtual void op_get_global_id(const Operand& dst_var, int32_t dim) = 0;
     virtual void op_get_global_coord(const Operand& dst, const Operand& step, const TensorOperand& tensor, int32_t dim) = 0;
     virtual void op_get_global_batch(const Operand& dst, const TensorOperand& tensor) = 0;
     virtual void op_get_global_size(const Operand& dst_var, int32_t dim) = 0;
     virtual void op_binary_expression(const Operand& dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
     virtual void op_assign(const Operand& dst_name, const Operand& src_name) = 0;
     virtual void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) = 0;
     virtual void op_if(const Operand& lhs, BinaryOp op, const Operand& rhs) = 0;
     virtual void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value, AssignmentOp update_op, const Operand& update_value) = 0;
     virtual void op_load_indirect(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y_indirect, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
     virtual void op_load_immediate(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32), const Operand& dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
     virtual void op_store_immediate(const TensorOperand& tensor, const Operand& src, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
     virtual void op_cast_expression(const Operand& dst, const Operand &src, ConvertPolicy policy) = 0;
     virtual void op_return() = 0;
     // virtual void op_else() = 0;
     // virtual void op_elseif() = 0;
     // Utils
     // It is the process of converting
     virtual void util_get_indirect_buffer(const Operand& dst, const TensorOperand& tensor, const Operand& x, const Operand& y, const Operand& x_off, const Operand& y_off) = 0;
 };

 enum class GpuLoadStoreType
 {
     Load  = 1,
     Store = 2
 };

 class IGpuLoadStoreHelperWriter
 {
 public:
     IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type) : _writer(x), _mapper(mapper), _type(type) {}
     IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;
     IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;
     virtual ~IGpuLoadStoreHelperWriter() = default;
     virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;
     virtual void write(const std::pair<int32_t, std::string>& y) = 0;
     virtual void finalize() = 0;
 protected:
     IGpuKernelWriter* _writer;
     GpuTensor3dMapper _mapper;
     GpuLoadStoreType  _type;
 };

 class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter
 {
 public:
     ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
     {
     }

     ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default;
     ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;

     static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
     {
         CKW_UNUSED(x, type, dst);

         if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
         {
             return false;
         }
         return true;
     }

     void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
     {
         assert(validate(_writer, _mapper, _type, dst));

         _dst           = dst;
         _ls_width_full = dst->format().w;

         _coord_x = x->scalar(0, 0).str;
         _coord_z = z->scalar(0, 0).str;
         _coord_b = b->scalar(0, 0).str;
         _coord_orig_z = _coord_z;

         out_of_bound_initialize_x(_coord_x);
         out_of_bound_initialize_z(_coord_z);

         /*
         meaning of else:
         - x: partial load/store
         - y: no load/store operation
         - z: no load/store operation
         if(x)
         {
             if(z)
             {
                 if(y)
                 {
                     // full load/store width
                 }
                 else
                 {
                     // no load/store
                 }
             }
             else
             {
                 // no load/store
             }
         }
         else
         {
             if(z)
             {
                 if(y)
                 {
                     // partial load/store width
                 }
                 else
                 {
                     // no load/store
                 }
             }
             else
             {
                 // no load/store
             }
         }
         */
     }

     void write(const std::pair<int32_t, std::string>& y) override
     {
         int32_t     idx_y   = y.first;
         std::string coord_y = y.second;

         // The only check required is on Y.
         out_of_bound_initialize_y(coord_y);

         const std::string dst     = _dst->vector(idx_y).str;
         const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
         const std::string ls_buf  = to_ls_buffer(_type, _ls_width_full, dst, address);

         _writer->write_text(ls_buf);
         _writer->write_text(";\n");

         out_of_bound_finalize_y(dst);

         // The left over load/store will be written in the finalize stage
         if(_ls_width_part.size() != 0)
         {
             int32_t w = 0;
             for(auto &p : _ls_width_part)
             {
                 const std::string dst0    = _dst->vector(w, p, idx_y).str;
                 const std::string coord_x = _coord_x + " + " + std::to_string(w);
                 const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
                 const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address);
                 _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0));

                 w += p;
             }
         }
     }

     void finalize() override
     {
         out_of_bound_finalize_z();
         out_of_bound_finalize_x();
     }
 private:
     IVectorTile*            _dst { nullptr };
     int32_t                  _ls_width_full { 0 };
     std::vector<int32_t>     _ls_width_part { };
     std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x {};
     std::string              _coord_x {};
     std::string              _coord_z {};
     std::string              _coord_orig_z {};
     std::string              _coord_b {};

     void out_of_bound_initialize_x(std::string& coord)
     {
         if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
         {
             auto tensor_format = _mapper.tensor_argument()->format();
             auto shape         = tensor_format.shape;

             _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
             if(_ls_width_part.size() != 0)
             {
                 _writer->write_text("if(" + coord + " > 0)\n");
                 _writer->compound_statement_begin();
             }
         }
     };

     void out_of_bound_finalize_x()
     {
         if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
         {
             if(_ls_width_part.size() != 0)
             {
                 _writer->compound_statement_end();
                 _writer->write_text("else\n");
                 _writer->compound_statement_begin();

                 out_of_bound_initialize_z(_coord_orig_z);
                 for(auto &i : _leftovers_x)
                 {
                     out_of_bound_initialize_y(i.first.second);
                     _writer->write_text(i.second);
                     _writer->write_text(";\n");
                     out_of_bound_finalize_y(i.first.first);
                 }
                 out_of_bound_finalize_z();
                 _writer->compound_statement_end();
             }
         }
     };

     void out_of_bound_initialize_y(std::string& coord)
     {
         std::string max = "";

         const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;

         switch(address_mode_y)
         {
             case TensorSamplerAddressModeY::Skip:
             case TensorSamplerAddressModeY::ClampToBorder:
             // NOTE: This line should not be moved outside of the switch statement.
             // The reason for that is because when we query the component, the component is marked as used
             // and added to the list of arguments of the kernel. Since, not in all cases this component is required,
             // we should request the component only when used
             max = _mapper.tensor_component_y();
             _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeY::SkipMinEdgeOnly:
             case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
             _writer->write_text("if(" + coord + " >= 0)\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
             case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
             max = _mapper.tensor_component_y();
             _writer->write_text("if(" + coord + " < " + max + ")\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeY::ClampToNearest:
             max = _mapper.tensor_component_y();
             coord = "clamp(" + coord + ", 0, " + max + " - 1)";
             break;
             case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
             max = _mapper.tensor_component_y();
             coord = "min(" + coord + ", " + max + " - 1)";
             break;
             case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
             coord = "max(" + coord + ", 0)";
             break;
             case TensorSamplerAddressModeY::None:
             break;
             default:
             std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl;
             assert(false);
         }
     };

     void out_of_bound_finalize_y(const std::string& dst)
     {
         const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;

         switch(address_mode_y)
         {
             case TensorSamplerAddressModeY::ClampToBorder:
             case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
             case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
             case TensorSamplerAddressModeY::Skip:
             case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
             case TensorSamplerAddressModeY::SkipMinEdgeOnly:
             _writer->compound_statement_end();
             break;

             default:
                 assert(false);
         }

         switch(address_mode_y)
         {
             case TensorSamplerAddressModeY::ClampToBorder:
             case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
             case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
             _writer->write_text("else\n");
             _writer->compound_statement_begin();
             _writer->write_text(dst);
             _writer->write_text(" = 0.0f;\n");
             _writer->compound_statement_end();
             break;

             default:
                 assert(false);
         }
     };

     void out_of_bound_initialize_z(std::string& coord)
     {
         std::string max = "";

         const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;

         switch(address_mode_z)
         {
             case TensorSamplerAddressModeZ::Skip:
             max = _mapper.tensor_component_z();
             _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
             _writer->write_text("if(" + coord + " >= 0)\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
             max = _mapper.tensor_component_z();
             _writer->write_text("if(" + coord + " < " + max + ")\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeZ::ClampToNearest:
             max = _mapper.tensor_component_z();
             coord = "clamp(" + coord + ", 0, " + max + " - 1)";
             break;
             case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
             max = _mapper.tensor_component_z();
             coord = "min(" + coord + ", " + max + " - 1)";
             break;
             case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
             coord = "max(" + coord + ", 0)";
             break;
             case TensorSamplerAddressModeZ::None:
             break;
             default:
             std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl;
             assert(false);
         }
     };

     void out_of_bound_finalize_z()
     {
         const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;

         switch(address_mode_z)
         {
             case TensorSamplerAddressModeZ::Skip:
             case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
             case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
             _writer->compound_statement_end();
             break;

             default:
                 assert(false);
         }
     };

     std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const
     {
         std::vector<int32_t> x;

         switch(ls_leftover_vector_width)
         {
             case 0:
                 break;
             case 1:
             case 2:
             case 3:
             case 4:
             case 8:
             case 16:
                 x.push_back(ls_leftover_vector_width);
                 break;
             case 5:
                 x.push_back(4);
                 x.push_back(1);
                 break;
             case 6:
                 x.push_back(4);
                 x.push_back(2);
                 break;
             case 7:
                 x.push_back(4);
                 x.push_back(3);
                 break;
             case 9:
                 x.push_back(8);
                 x.push_back(1);
                 break;
             case 10:
                 x.push_back(8);
                 x.push_back(2);
                 break;
             case 11:
                 x.push_back(8);
                 x.push_back(3);
                 break;
             case 12:
                 x.push_back(8);
                 x.push_back(4);
                 break;
             case 13:
                 x.push_back(8);
                 x.push_back(4);
                 x.push_back(1);
                 break;
             case 14:
                 x.push_back(8);
                 x.push_back(4);
                 x.push_back(2);
                 break;
             case 15:
                 x.push_back(8);
                 x.push_back(4);
                 x.push_back(3);
                 break;

             default:
                 assert(false);
         }
         return x;
     }

     std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& address)
     {
         switch(type)
         {
             case GpuLoadStoreType::Load:
             if(vector_width != 1)
             {
                 return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
             }
             else
             {
                 return data + " = *(" + address + ")";
             }
             break;
             case GpuLoadStoreType::Store:
             if(vector_width != 1)
             {
                 return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
             }
             else
             {
                 return "*(" + address + ") = " + data;
             }
             break;
             default:
             std::cout << "Unsupported GpuLoadStoreType" << std::endl;
             assert(false);
             return "";
         }
     }

     std::string to_ls_buffer_address(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
     {
         auto tensor_storage            = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
         assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
         const std::string ptr_buf      = _mapper.tensor_argument()->storage(tensor_storage);
         const std::string dst_type     = get_cl_data_type(_dst->format().dt, 1);

         std::string address;
         address += "(__global ";
         address += dst_type;
         address += "*)(";
         address += ptr_buf;
         if(x != "0" && (_mapper.is_one_component_x() != true))
         {
             address += " + (";
             address += x + ") * sizeof(" + dst_type + ")";
         }
         if(y != "0" && (_mapper.is_one_component_y() != true))
         {
             const std::string stride_y = _mapper.tensor_component_stride_y();
             address += " + (";
             address += y + ")";
             address += " * ";
             address += stride_y;
         }
         if(z != "0" && (_mapper.is_one_component_z() != true))
         {
             const std::string stride_z = _mapper.tensor_component_stride_z();
             address += " + (";
             address += z + ")";
             address += " * ";
             address += stride_z;
         }
         if(b != "0" && (_mapper.is_one_component_batch() != true))
         {
             const std::string stride_b = _mapper.tensor_component_stride_batch();
             address += " + (";
             address += b + ")";
             address += " * ";
             address += stride_b;
         }
         address += ")";
         return address;
     }
 };

 class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
 {
 public:
     static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type, IVectorTile *dst)
     {
         CKW_UNUSED(x);

         if(dst->format().w != 4)
         {
             return false;
         }
         if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
         {
             return false;
         }
         if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
         {
             return false;
         }
         if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
         {
             return false;
         }
         if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store)
         {
             return false;
         }
         if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
         {
             return false;
         }
         return true;
         /*
         - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4
         - z: Only GpuSamplerAddressModeZ::None is supported
         */
     }
     ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
     {
     }

     ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default;
     ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default;

     void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
     {
         assert(validate(_writer, _mapper, _type, dst));

         _dst           = dst;
         _ls_width_full = dst->format().w;
         _coord_x       = x->scalar(0, 0).str;
         _coord_z       = z->scalar(0, 0).str;
         _coord_b       = b->scalar(0, 0).str;

         /*
         if(y)
         {
             // full load/store width
         }
         else
         {
             // no load/store
         }
         */
     }

     void write(const std::pair<int32_t, std::string>& y) override
     {
         int32_t     idx_y   = y.first;
         std::string coord_y = y.second;

         // The only check required is on Y.
         out_of_bound_initialize_y(coord_y);

         const std::string dst     = _dst->vector(idx_y).str;
         const std::string sampler = to_ls_image2d_sampler();
         const std::string coord   = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b);
         const std::string ls_buf  = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord);

         _writer->write_text(ls_buf);
         _writer->write_text(";\n");

         out_of_bound_finalize_y(dst);
     }

     void finalize() override
     {
     }
 private:
     IVectorTile*   _dst { nullptr };
     int32_t     _ls_width_full { 0 };
     std::string _coord_x {};
     std::string _coord_z {};
     std::string _coord_b {};

     void out_of_bound_initialize_y(std::string& coord)
     {
         std::string max = "";

         const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;

         switch(address_mode_y)
         {
             case TensorSamplerAddressModeY::Skip:
             max = _mapper.tensor_component_y();
             _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeY::SkipMinEdgeOnly:
             _writer->write_text("if(" + coord + " >= 0)\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
             max = _mapper.tensor_component_y();
             _writer->write_text("if(" + coord + " < " + max + ")\n");
             _writer->compound_statement_begin();
             break;
             case TensorSamplerAddressModeY::ClampToBorder:
             case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
             case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
             case TensorSamplerAddressModeY::ClampToNearest:
             case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
             case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
             case TensorSamplerAddressModeY::None:
             break;
             default:
             std::cout << "Unsupported address mode for write_out_of_bound_check_y" << std::endl;
             assert(false);
         }
     };

     void out_of_bound_finalize_y(const std::string& dst)
     {
         CKW_UNUSED(dst);

         const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;

         switch(address_mode_y)
         {
             case TensorSamplerAddressModeY::Skip:
             case TensorSamplerAddressModeY::SkipMinEdgeOnly:
             case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
             _writer->compound_statement_end();
             break;

             default:
                 assert(false);
         }
     };

     std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& sampler, const std::string& coord)
     {
         CKW_UNUSED(vector_width);

         auto tensor_storage            = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
         const std::string image2d_obj  = _mapper.tensor_argument()->storage(tensor_storage);
         // const DataType dt              = _dst->format().dt;
         const std::string post_fix     = _dst->format().dt == DataType::Fp32? "f" : "h";

         switch(type)
         {
             case GpuLoadStoreType::Load:
             return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
             break;
             case GpuLoadStoreType::Store:
             return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
             default:
             assert(false);
             std::cout << "Unsupported GpuLoadStoreType" << std::endl;
             assert(false);
             return "";
         }
     }

     std::string to_ls_image2d_sampler() const
     {
         const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;

         switch(address_mode_y)
         {
             case TensorSamplerAddressModeY::None:
             return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
             case TensorSamplerAddressModeY::Skip:
             case TensorSamplerAddressModeY::SkipMinEdgeOnly:
             case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
             case TensorSamplerAddressModeY::ClampToBorder:
             case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
             case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
             return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
             case TensorSamplerAddressModeY::ClampToNearest:
             case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
             case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
             return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
             default:
             std::cout << "Unsupported address_mode_coord" << std::endl;
             assert(false);
             return "";
         }
     }

     std::string to_ls_image2d_coord(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
     {
         std::string coord_x = "(" + x + ") >> 2";
         std::string coord_y = "(";

         if(y != "0" && (_mapper.is_one_component_y() != true))
         {
             coord_y += y;
         }
         if(z != "0" && (_mapper.is_one_component_z() != true))
         {
             const std::string dim = _mapper.tensor_component_y();
             coord_y += " + (";
             coord_y += z + ")";
             coord_y += " * ";
             coord_y += dim;
         }
         if(b != "0" && (_mapper.is_one_component_batch() != true))
         {
             const std::string dim0 = _mapper.tensor_component_y();
             const std::string dim1 = _mapper.tensor_component_z();
             coord_y += " + (";
             coord_y += b + ")";
             coord_y += " * ";
             coord_y += dim0;
             coord_y += " * ";
             coord_y += dim1;
         }
         coord_y += ")";
         return "(int2)(" + coord_x + ", " + coord_y + ")";
     }
 };

 /** IGpuLoadStoreHelperWriter factory class */
 class ClLoadStoreHelperWriterFactory final
 {
 public:
     /** Static method to call the IGpuLoadStoreHelperWriter class accordingly with the tensor storage set in the mapper
      *
      *
      * @return IGpuLoadStoreHelperWriter
      */
     static std::unique_ptr<IGpuLoadStoreHelperWriter> create(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type)
     {
         const auto tensor_storage = mapper.gpu_sampler().storage;
         switch(tensor_storage)
         {
             case GpuSamplerTensorStorage::BufferUint8Ptr:
                 return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
             case GpuSamplerTensorStorage::Image2dReadOnly:
             case GpuSamplerTensorStorage::Image2dWriteOnly:
                 return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
             default:
                 std::cout << "Unsupported Gpu tensor storage" << std::endl;
                 assert(false);
                 return nullptr;
         }
     }
 };

 // This utility method needs to go in utils.h
 inline bool is_tile_scalar(IVectorTile* x)
 {
     return x->format().w == 1 && x->format().h == 1;
 }

 class ClKernelWriter : public IGpuKernelWriter
 {
 public:
     ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
     {
         _data = x;
         _attr = attr;
     }

     ClKernelWriter(const ClKernelWriter &) = default;
     ClKernelWriter &operator=(const ClKernelWriter &) = default;

     // A IdSpaced ID is a term used to describe a fragment that is registered in ICode to ensure
     // there are no conflicts or ambiguity in the code
     void set_IdSpace(int32_t id) override
     {
         _data->tiles.set_IdSpace(id);
         _data->arguments.set_IdSpace(id);
     }

     void import_tile(const std::string& dst_name, const IVectorTile *src) override
     {
         _data->tiles.insert(dst_name, src);
     }

     void declare_argument(const std::string& name, const TensorInfo& tensor) override
     {
         assert(_data->arguments[name] == nullptr);
         _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
     }

     void declare_tile(const std::string& name, const TileInfo& format) override
     {
         assert(_data->tiles[name] == nullptr);
         _data->tiles.insert(name, format);

         IVectorTile *x = _data->tiles[name];

         for(auto &t : x->underlying_source_variables())
         {
             _data->code += t.type.str + " " + t.str + ";\n";
         }
     }

     void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) override
     {
         assert(_data->tiles[name] == nullptr);
         _data->tiles.insert(name, in, dt);
         // Note: A constant does not need to be declared in the code
     }

     void write_text(const std::string& x) override
     {
         _data->code += x;
     }

     void compound_statement_begin() override
     {
         _data->tiles.increment_registry_level();
         _data->code += "{\n";
     }

     void compound_statement_end() override
     {
         _data->tiles.decrement_registry_level();
         _data->code += "}\n";
     }

     void op_get_global_id(const Operand& dst_var, int32_t dim) override
     {
         assert(dst_var.type() == OperandType::Tile);
         assert(_data->tiles.has_tile(dst_var.value()));
         assert(_data->tiles[dst_var.value()]->format().w == 1 &&
                _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable

         auto var = _data->tiles[dst_var.value()];

         _data->code += var->scalar(0, 0).str;
         _data->code += " = get_global_id(";
         _data->code += std::to_string(dim);
         _data->code += ");\n";
     };

     void op_get_global_coord(const Operand& o_dst, const Operand& o_step, const TensorOperand& o_tensor, int32_t dim) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto dst  = operands.unpack(o_dst);
         auto step = operands.unpack(o_step);

         // Validation: Check that x, y and z are scalar

         TensorOperandUnpacker tensor_operands(_data->arguments);
         auto tensor      = tensor_operands.unpack(o_tensor);
         auto gpu_sampler = o_tensor.sampler();

         GpuTensor3dMapper mapper(tensor, gpu_sampler);

         switch (dim)
         {
         case 0:
             if(mapper.is_one_component_x())
             {
                 _data->code += dst->scalar(0, 0).str;
                 _data->code += " = 0;\n";
             }
             else
             {
                 if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
                 {
                     // Validation: Check: fixed tensor shape
                     // TO BE CHANGED
                     _data->code += dst->scalar(0, 0).str;
                     _data->code += " = get_global_id(0) * ";
                     _data->code += step->scalar(0, 0).str;
                     _data->code += ";\n";
                 }
                 else
                 {
                     _data->code += dst->scalar(0, 0).str;
                     _data->code += " = get_global_id(0) * ";
                     _data->code += step->scalar(0, 0).str;
                     _data->code += ";\n";
                 }
             }
             break;
         case 1:
             if(mapper.is_one_component_y())
             {
                 _data->code += dst->scalar(0, 0).str;
                 _data->code += " = 0;\n";
             }
             else
             {
                 if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
                 {

                 }
                 else
                 {
                     _data->code += dst->scalar(0, 0).str;
                     _data->code += " = get_global_id(1) * ";
                     _data->code += step->scalar(0, 0).str;
                     _data->code += ";\n";
                 }
             }
             break;
         case 2:
             if(mapper.is_one_component_z())
             {
                 _data->code += dst->scalar(0, 0).str;
                 _data->code += " = 0;\n";
             }
             else
             {
                 _data->code += dst->scalar(0, 0).str;
                 _data->code += " = get_global_id(2) * ";
                 _data->code += step->scalar(0, 0).str;
                 _data->code += ";\n";
             }
             break;
         default:
             break;
         }
     };

     void op_get_global_batch(const Operand& o_dst, const TensorOperand& o_tensor) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto dst  = operands.unpack(o_dst);

         TensorOperandUnpacker tensor_operands(_data->arguments);
         auto tensor      = tensor_operands.unpack(o_tensor);
         auto gpu_sampler = o_tensor.sampler();

         GpuTensor3dMapper mapper(tensor, gpu_sampler);

         if(mapper.is_one_component_batch())
         {
             _data->code += dst->scalar(0, 0).str;
             _data->code += " = 0;\n";
         }
         else
         {
             std::cout << "Unsupported batched computation" << std::endl;
             assert(false);
         }
     };

     void op_get_global_size(const Operand& dst_var, int32_t dim) override
     {
         assert(dst_var.type() == OperandType::Tile);
         assert(_data->tiles.has_tile(dst_var.value()));
         assert(_data->tiles[dst_var.value()]->format().w == 1 &&
                _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable

         auto var = _data->tiles[dst_var.value()];

         _data->code += var->scalar(0, 0).str;
         _data->code += " = get_global_size(";
         _data->code += std::to_string(dim);
         _data->code += ");\n";
     }

     void op_binary_expression(const Operand& dst_name, const Operand& lhs_name, BinaryOp op, const Operand& rhs_name) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto lhs = operands.unpack(lhs_name);
         auto rhs = operands.unpack(rhs_name);
         auto dst = operands.unpack(dst_name);

         const int32_t dst_w = dst->format().w;
         const int32_t dst_h = dst->format().h;
         assert(lhs != nullptr);
         const int32_t lhs_w = lhs->format().w;
         const int32_t rhs_w = rhs->format().w;

         if(op == BinaryOp::MatMul_Nt_T)
         {
             assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
             for(int32_t y = 0; y < dst_h; ++y)
             {
                 for(int32_t x = 0; x < dst_w; ++x)
                 {
                     for(int32_t k = 0; k < lhs_w; ++k)
                     {
                         _data->code += dst->scalar(x, y).str;
                         _data->code += " = fma(";
                         _data->code += lhs->scalar(k, y).str;
                         _data->code += ", ";
                         _data->code += rhs->scalar(k, x).str;
                         _data->code += ", ";
                         _data->code += dst->scalar(x, y).str;
                         _data->code += ");\n";
                     }
                 }
             }

             return;
         }

         bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
         bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;

         std::string lhs_prefix = broadcast_lhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
         std::string rhs_prefix = broadcast_rhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
         std::string op_str     = to_string(op);

         // Broadcasting on Y is automatic
         for(int32_t y = 0; y < dst_h; ++y)
         {
             _data->code += dst->vector(y).str;
             _data->code += " = ";
             _data->code += lhs_prefix + lhs->vector(y).str;
             _data->code += " ";
             _data->code += op_str;
             _data->code += " ";
             _data->code += rhs_prefix + rhs->vector(y).str;
             _data->code += ";\n";
         }
     };

     void op_cast_expression(const Operand& o_dst, const Operand &o_src, ConvertPolicy policy) override
     {
         CKW_UNUSED(policy);

         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto src = operands.unpack(o_src);
         auto dst = operands.unpack(o_dst);

         // const int32_t dst_w  = dst->format().w;
         const int32_t dst_h  = dst->format().h;
         const std::string dt = dst->scalar(0, 0).type.str;

         // Broadcasting on Y is automatic
         for(int32_t y = 0; y < dst_h; ++y)
         {
             _data->code += dst->vector(y).str;
             _data->code += " = convert_" + dt + "(";
             _data->code += src->vector(y).str;
             _data->code += ");\n";
         }
     };

     void op_assign(const Operand& dst_name, const Operand& src_name) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto src = operands.unpack(src_name);
         auto dst = operands.unpack(dst_name);

         const int32_t dst_w  = dst->format().w;
         const int32_t dst_h  = dst->format().h;
         const int32_t src_w  = src->format().w;
         // const int32_t src_h  = src->format().h;
         const std::string dt = dst->scalar(0, 0).type.str;

         bool broadcast_src_x = dst_w != 1 && src_w == 1;

         std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";

         // Broadcasting on Y is automatic
         for(int32_t y = 0; y < dst_h; ++y)
         {
             _data->code += dst->vector(y).str;
             _data->code += " = ";
             _data->code += src_prefix + src->vector(y).str;
             _data->code += ";\n";
         }
     }

     void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto src = operands.unpack(src_name);
         auto dst = operands.unpack(dst_name);

         const int32_t dst_w  = dst->format().w;
         const int32_t dst_h  = dst->format().h;
         const int32_t src_w  = src->format().w;
         // const int32_t src_h  = src->format().h;
         const std::string dt = dst->scalar(0, 0).type.str;

         bool broadcast_src_x = dst_w != 1 && src_w == 1;

         std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";

         // Broadcasting on Y is automatic
         for(int32_t y = 0; y < dst_h; ++y)
         {
             _data->code += dst->vector(y).str;
             _data->code += " = ";

             switch(func)
             {
                 case ScalarUnaryFunction::Exp:
                     _data->code += "exp(";
                     break;

                 default:
                     CKW_ASSERT(false);
             }

             _data->code += src_prefix + src->vector(y).str;
             _data->code += ");\n";
         }
     }

     void op_if(const Operand& o_lhs, BinaryOp op, const Operand& o_rhs) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto lhs = operands.unpack(o_lhs);
         auto rhs = operands.unpack(o_rhs);

         assert(is_tile_scalar(lhs));
         assert(is_tile_scalar(rhs));

         _data->code += "if(";
         _data->code += lhs->scalar(0, 0).str;
         _data->code += " ";
         _data->code += to_string(op);
         _data->code += " ";
         _data->code += rhs->scalar(0, 0).str;
         _data->code += ")\n";
     }

     void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, AssignmentOp update_op, const Operand& update_value_name) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto var          = operands.unpack(var_name);
         auto cond_value   = operands.unpack(cond_value_name);
         auto update_value = operands.unpack(update_value_name);

         const int32_t dst_w = var->format().w;
         const int32_t dst_h = var->format().h;

         // It must be a scalar variable
         CKW_UNUSED(dst_w, dst_h);
         assert(dst_w == 1);
         assert(dst_h == 1);

         _data->code += "for(; " ;
         _data->code += var->scalar(0, 0).str;
         _data->code += " ";
         _data->code += to_string(cond_op);
         _data->code += " " + cond_value->scalar(0, 0).str + "; ";
         _data->code += var->scalar(0, 0).str;
         _data->code += " ";
         _data->code += to_string(update_op);
         _data->code += " " + update_value->scalar(0, 0).str + ")";
         _data->code += "\n";
     }

     void op_load_immediate(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_y, const Operand& o_z, const Operand& o_batch_idx, const Operand& dilation_y) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto dst      = operands.unpack(o_dst);
         auto x        = operands.unpack(o_x);
         auto y        = operands.unpack(o_y);
         auto z        = operands.unpack(o_z);
         auto dil_y    = operands.unpack(dilation_y);
         auto b        = operands.unpack(o_batch_idx);

         TensorOperandUnpacker tensor_operands(_data->arguments);
         auto tensor      = tensor_operands.unpack(o_tensor);
         auto gpu_sampler = o_tensor.sampler();

         GpuTensor3dMapper mapper(tensor, gpu_sampler);

         auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);

         // Initialize the constant part
         load_writer->initialize(dst, x, z, b);

         for(int i = 0; i < dst->format().h; ++i)
         {
             std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
             if(dil_y->scalar(0, 0).str != "1")
             {
                 coord_y += " * " + dil_y->scalar(0, 0).str;
             }
             load_writer->write(std::make_pair(i, coord_y));
         }

         load_writer->finalize();
     }

     void op_load_indirect(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_indirect_h, const Operand& o_z, const Operand& o_batch_idx) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto dst      = operands.unpack(o_dst);
         auto x        = operands.unpack(o_x);
         auto y_ind    = operands.unpack(o_indirect_h);
         auto z        = operands.unpack(o_z);
         auto b        = operands.unpack(o_batch_idx);

         TensorOperandUnpacker tensor_operands(_data->arguments);
         auto tensor      = tensor_operands.unpack(o_tensor);
         auto gpu_sampler = o_tensor.sampler();

         GpuTensor3dMapper mapper(tensor, gpu_sampler);

         auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);

         // Initialize the constant part
         load_writer->initialize(dst, x, z, b);

         for(int i = 0; i < dst->format().h; ++i)
         {
             load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
         }

         load_writer->finalize();
     }

     void op_store_immediate(const TensorOperand& tensor_name, const Operand& src_name, const Operand& x_name, const Operand& y_name, const Operand& z_name, const Operand& batch_index_name) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto src      = operands.unpack(src_name);
         auto x        = operands.unpack(x_name);
         auto y        = operands.unpack(y_name);
         auto z        = operands.unpack(z_name);
         auto b        = operands.unpack(batch_index_name);

         TensorOperandUnpacker tensor_operands(_data->arguments);
         auto tensor      = tensor_operands.unpack(tensor_name);
         auto gpu_sampler = tensor_name.sampler();

         GpuTensor3dMapper mapper(tensor, gpu_sampler);

         auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);

         // Initialize the constant part
         store_writer->initialize(src, x, z, b);

         int32_t tile_h = src->format().h;

         for(int m0 = tile_h - 1; m0 >= 0; m0--)
         {
             store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
         }

         store_writer->finalize();
     }

     void op_return() override
     {
         _data->code += "return;\n";
     }

     void util_get_indirect_buffer(const Operand& o_dst, const TensorOperand& o_tensor, const Operand& o_x, const Operand& o_y, const Operand& o_x_off, const Operand& o_y_off) override
     {
         OperandUnpacker operands(_data->tiles, _data->arguments);
         auto dst    = operands.unpack(o_dst);
         auto x      = operands.unpack(o_x);
         auto y      = operands.unpack(o_y);
         auto x_off  = operands.unpack(o_x_off);
         auto y_off  = operands.unpack(o_y_off);

         TensorOperandUnpacker tensor_operands(_data->arguments);
         auto tensor   = tensor_operands.unpack(o_tensor);

         assert(dst->format().w == 1);
         assert(x->format().w == 1);
         assert(y->format().w == 1);
         assert(x_off->format().w == 1);
         assert(y_off->format().w == 1);
         assert(dst->format().dt == DataType::Int32);
         assert(x->format().dt == DataType::Int32);
         assert(y->format().dt == DataType::Int32);
         assert(x_off->format().dt == DataType::Int32);
         assert(y_off->format().dt == DataType::Int32);

         const std::string width  = tensor->component(TensorComponent::W);
         const std::string height = tensor->component(TensorComponent::H);
         const std::string wxh    = tensor->component(TensorComponent::WxH);
         /*
         int x_s;
         int y_s;
         x_s = (xi_0 + x_k);
         y_s = (yi_0 + y_k);
         mi_0 = x_s + y_s * width + b * widthxheight;
         mi_0 = select(-1, mi_0, x_s >= 0);
         mi_0 = select(-1, mi_0, y_s >= 0);
         mi_0 = select(-1, mi_0, x_s < 128);
         mi_0 = select(-1, mi_0, y_s < 128);
         */
         compound_statement_begin();
         declare_tile("_x_s", TileInfo(DataType::Int32));
         declare_tile("_y_s", TileInfo(DataType::Int32));
         auto x_s = operands.unpack(Operand("_x_s"));
         auto y_s = operands.unpack(Operand("_y_s"));
         for(int i = 0; i < dst->format().h; ++i)
         {
             // x_s = (xi_0 + x_k);
             // y_s = (yi_0 + y_k);
             _data->code += x_s->scalar(0, i).str;
             _data->code += " = (";
             _data->code += x->scalar(0, i).str;
             _data->code += " + ";
             _data->code += x_off->scalar(0, i).str;
             _data->code += ");\n";
             _data->code += y_s->scalar(0, i).str;
             _data->code += " = (";
             _data->code += y->scalar(0, i).str;
             _data->code += " + ";
             _data->code += y_off->scalar(0, i).str;
             _data->code += ");\n";
             // mi_0 = x_s + y_s * width;
             _data->code += dst->scalar(0, i).str;
             _data->code += " = ";
             _data->code += x_s->scalar(0, i).str;
             _data->code += " + ";
             _data->code += y_s->scalar(0, i).str;
             _data->code += " * " + width + ";\n";
             // mi_0 = select(wxh, mi_0, x_s >= 0);
             _data->code += dst->scalar(0, i).str;
             _data->code += " = select(-1, ";
             _data->code += dst->scalar(0, i).str;
             _data->code += ", ";
             _data->code += x_s->scalar(0, i).str;
             _data->code += " >= 0);\n";
             // mi_0 = select(wxh, mi_0, y_s >= 0);
             _data->code += dst->scalar(0, i).str;
             _data->code += " = select(-1, ";
             _data->code += dst->scalar(0, i).str;
             _data->code += ", ";
             _data->code += y_s->scalar(0, i).str;
             _data->code += " >= 0);\n";
             // mi_0 = select(wxh, mi_0, x_s < width);
             _data->code += dst->scalar(0, i).str;
             _data->code += " = select(-1, ";
             _data->code += dst->scalar(0, i).str;
             _data->code += ", ";
             _data->code += x_s->scalar(0, i).str;
             _data->code += " < ";
             _data->code += width + ");\n";
             // mi_0 = select(wxh, mi_0, y_s < height);
             _data->code += dst->scalar(0, i).str;
             _data->code += " = select(-1, ";
             _data->code += dst->scalar(0, i).str;
             _data->code += ", ";
             _data->code += y_s->scalar(0, i).str;
             _data->code += " < ";
             _data->code += height + ");\n";
         }
         compound_statement_end();
     }

 private:
     GpuKernelWriterDataHolder* _data { nullptr };
     GpuKernelWriterAttribute * _attr { nullptr };
 };

 /** IGpuKernelWriter factory class */
 class GpuKernelWriterFactory final
 {
 public:
     /** Static method to call the IGpuKernelWriter class accordingly with the Gpu programming language
      *
      * @param[in] gpu GPU target
      *
      * @return IGpuKernelWriter
      */
     static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
     {
         switch(x->programming_language())
         {
             case GpuTargetLanguage::OpenCL:
                 return std::make_unique<ClKernelWriter>(attr, x);
             default:
                 std::cout << "Unsupported Gpu programming language" << std::endl;
                 assert(false);
                 return nullptr;
         }
     }
 };

 inline int32_t adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx)
 {
     auto tensor = tensor_info_id->shape;

     int32_t dim[3] = {0};

     switch(tensor_format)
     {
         case TensorSamplerFormat::C_W_H:
         dim[0] = tensor[0];
         dim[1] = tensor[1];
         dim[2] = tensor[2];
         break;
         case TensorSamplerFormat::C_WH_1:
         dim[0] = tensor[0];
         dim[1] = tensor[1] * tensor[2];
         dim[2] = 1;
         break;
         default:
         std::cout << "Unsupported tensor format" << std::endl;
         assert(false);
         break;
     }

     return std::min(step, dim[idx]);
 }

 } // namespace prototype
 } // namespace ckw

 #endif // CKW_PROTOTYPE_SRC_PROTOTYPE_H