| /* |
| * Copyright (c) 2023 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H |
| #define CKW_PROTOTYPE_SRC_PROTOTYPE_H |
| |
| #include <algorithm> |
| #include <array> |
| #include <cassert> // assert (to be removed) |
| #include <chrono> |
| #include <cmath> |
| #include <cstdint> // int32_t |
| #include <functional> |
| #include <iostream> // cout (to be removed) |
| #include <map> |
| #include <memory> |
| #include <stdexcept> |
| #include <string> |
| #include <unordered_map> |
| #include <vector> |
| |
| #include "ckw/Error.h" |
| #include "ckw/TensorInfo.h" |
| #include "ckw/types/ConvertPolicy.h" |
| #include "ckw/types/DataType.h" |
| #include "ckw/types/Functions.h" |
| #include "ckw/types/GpuTargetLanguage.h" |
| #include "ckw/types/Operators.h" |
| #include "ckw/types/TensorSamplerTypes.h" |
| |
| namespace ckw |
| { |
| namespace prototype |
| { |
| |
// Dummy data structure for Size2D (prototype placeholder: a vector of int32_t instead of a real Size2D class)
using Size2D = std::vector<int32_t>;

// Dummy Status (prototype placeholder: no error reporting implemented, so functions "return" void)
using Status = void;
| |
// Classification of a kernel component.
// NOTE(review): names suggest this drives fusion decisions (Unfusable = cannot be fused) — confirm with callers.
enum class ComponentType : int32_t
{
    Complex = 0,   // Component with complex computation
    Simple = 1,    // Lightweight component
    Unfusable = 2  // Component that cannot be combined with others
};
| |
// Trade-off between kernel compilation time and resulting runtime latency.
enum class GpuCompilationSpeed
{
    Fast = 0x00, // fast compilation may increase the latency of the network
    Slow = 0x01  // slow compilation may decrease the latency of the network
};
| |
// GPU capability flags. Semantics inferred from the names (half-precision float, 8-bit dot
// product, matrix multiply, fast/relaxed math) — confirm against the point of use.
enum class GpuExtensions
{
    Fp16,
    Dot8,
    Mmul,
    FastMath
};
| |
// Lightweight tensor descriptor used throughout the prototype.
struct TensorInfo
{
    TensorShape shape{ { 0 } };                             // Tensor dimensions
    DataType data_type{ DataType::Unknown };                // Element data type
    TensorDataLayout data_layout{ TensorDataLayout::Nhwc }; // Memory layout (NHWC by default)
    int32_t id{ -1 };                                       // Numeric identifier; -1 presumably means "unassigned" — TODO confirm
};
| |
// Per-component attributes controlling code-generation trade-offs.
struct ComponentAttribute
{
    GpuCompilationSpeed compilation_speed{ GpuCompilationSpeed::Fast }; // Preferred compilation-speed/latency trade-off
    bool overwrite_tile{ true }; // presumably allows the component to overwrite its destination tile — TODO confirm
};
| |
| inline std::string data_type_to_cl_type(DataType dt) |
| { |
| switch(dt) |
| { |
| case DataType::Fp32: |
| return "float"; |
| case DataType::Fp16: |
| return "half"; |
| case DataType::Int8: |
| return "char"; |
| case DataType::Uint8: |
| return "uchar"; |
| case DataType::Uint16: |
| return "ushort"; |
| case DataType::Int16: |
| return "short"; |
| case DataType::Uint32: |
| return "uint"; |
| case DataType::Int32: |
| return "int"; |
| case DataType::Bool: |
| return "bool"; |
| default: |
| assert(false); |
| return ""; |
| } |
| } |
| |
/** Round a tile width up to the nearest OpenCL vector length.
 *
 * Valid OpenCL vector lengths are 1, 2, 3, 4, 8 and 16.
 *
 * @param[in] width Tile width (must be in [1, 16])
 *
 * @return the smallest supported vector length >= width; asserts and returns 0 otherwise
 */
inline int32_t width_to_cl_vector_size(int32_t width)
{
    if(width >= 1 && width <= 4)
    {
        return width; // 1, 2, 3 and 4 are valid lengths as-is
    }
    if(width >= 5 && width <= 8)
    {
        return 8;
    }
    if(width >= 9 && width <= 16)
    {
        return 16;
    }
    assert(false);
    return 0;
}
| |
| inline std::string get_cl_data_type(DataType dt, int32_t width) |
| { |
| std::string data_type; |
| int32_t w = width_to_cl_vector_size(width); |
| data_type += data_type_to_cl_type(dt); |
| if(w != 1) |
| { |
| data_type += std::to_string(w); |
| } |
| return data_type; |
| } |
| |
/** Return the opening of an OpenCL store expression for a given vector length.
 *
 * @param[in] vector_length Number of elements to store
 *
 * @return "vstoreN(" for vectors, "*(" (plain pointer dereference) for scalars
 */
inline std::string to_opencl_store(int32_t vector_length)
{
    return (vector_length == 1) ? std::string("*(") : "vstore" + std::to_string(vector_length) + "(";
}
| |
| struct TileInfo |
| { |
| TileInfo() |
| { |
| } |
| |
| TileInfo(DataType dt) |
| : dt(dt), w(1), h(1) |
| { |
| } |
| |
| TileInfo(DataType dt, int32_t width) |
| : dt(dt), w(width), h(1) |
| { |
| } |
| |
| TileInfo(DataType dt, int32_t width, int32_t height) |
| : dt(dt), w(width), h(height) |
| { |
| } |
| |
| DataType dt{ DataType::Unknown }; // Data type of the tile |
| int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels) |
| int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions) |
| }; |
| |
| inline std::ostream &operator<<(std::ostream &o, const TileInfo &a) |
| { |
| o << a.w << " x " << a.h; |
| return o; |
| } |
| |
// A data type carried both as source text (e.g. "float4") and as its (DataType, vector size) pair.
struct DataTypeAsString
{
    std::string str{ "" };           // Type name as it appears in the generated source
    DataType dt{ DataType::Unknown };// Element data type
    int32_t size{ 1 };               // Vector size (1 = scalar)
};
| |
// A value (variable name or literal expression) and its type, both in source-text form.
struct ValueAsString
{
    std::string str{ "" };  // The value as it appears in the generated source
    DataTypeAsString type{};// Type information for the value
};
| |
// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
// A Tile is a collection of variables used to express a 2D data.
// IScalarTile is the element-level view: it exposes each (x, y) element as a scalar source-text variable.
class IScalarTile
{
public:
    virtual ~IScalarTile() = default;

    /** Method to get the scalar variable from a tile
     * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
     * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
     *
     * @return the scalar variable as a string
     */
    virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;

    /** Method to get the list of underlying variable names used by the tile
     *
     * @return the list of variable names
     */
    virtual std::vector<ValueAsString> underlying_source_variables() const = 0;

    /** Method to get the name of the tile.
     *
     * @return the name of the tile
     */
    std::string name() const
    {
        return _basename;
    }

    /** Method to get the tile format
     *
     * @return the format
     */
    TileInfo format() const
    {
        return _format;
    }

    /** Method to know whether the tile is assignable or not (constant)
     *
     * @return true if the tile is assignable
     */
    virtual bool is_assignable() const = 0;

    /** Method to know whether the tile needs to be declared
     *
     * @return true if the tile needs to be declared in the code before being used
     */
    virtual bool need_declaration() const = 0;

protected:
    TileInfo _format{};          // Tile format
    std::string _basename{ "" }; // Tile name
};
| |
// A tile is a collection of variables used to express a 2D data. The variables are vectors in the GPU context.
// The vector size is given by the width of the tile; the height of the tile gives the number of vectors.
class IVectorTile : public IScalarTile
{
public:
    virtual ~IVectorTile() = default;

    /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
     * The user can query the list of supported width for the vectors through preferred_vector_sizes().
     *
     * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
     *
     * @return the vector variable as a string
     */
    virtual ValueAsString vector(int32_t y) const = 0;

    /** Method to get a sub-vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
     *
     * @param[in] x_start X coordinate of the first element of the sub-vector
     * @param[in] width   Number of elements in the sub-vector
     * @param[in] y       Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
     *
     * @return the vector variable as a string
     */
    virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
    /** Method to get the preferred vector sizes.
     *
     * @return a vector with the preferred vector sizes
     */
    //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
};
| |
| class ClTile : public IVectorTile |
| { |
| public: |
| ClTile(const std::string &name, TileInfo format) |
| { |
| _format = format; |
| _basename = name; |
| } |
| |
| ValueAsString scalar(int32_t x, int32_t y) const override |
| { |
| x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0)); |
| y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0)); |
| |
| ValueAsString t; |
| t.str = build_variable_name(y); |
| t.type.str = get_cl_data_type(_format.dt, 1); |
| t.type.dt = _format.dt; |
| t.type.size = 1; |
| |
| // Check required because if the width has only one element, we cannot use .s0 |
| if(_format.w != 1) |
| { |
| // Automatic broadcasting |
| t.str += ".s" + std::to_string(x); |
| } |
| |
| return t; |
| } |
| |
| ValueAsString vector(int32_t y) const override |
| { |
| y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0)); |
| |
| ValueAsString t; |
| t.str = build_variable_name(y); |
| t.type.str = get_cl_data_type(_format.dt, _format.w); |
| t.type.dt = _format.dt; |
| t.type.size = _format.w; |
| return t; |
| } |
| |
| ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override |
| { |
| y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0)); |
| |
| ValueAsString t; |
| t.str = build_variable_name(y); |
| t.type.str = get_cl_data_type(_format.dt, width); |
| t.type.dt = _format.dt; |
| t.type.size = width; |
| |
| if(_format.w != 1) |
| { |
| t.str += ".s"; |
| for(int i = 0; i < width; ++i) |
| { |
| t.str += to_scalar_hex(x_start + i); |
| } |
| } |
| return t; |
| } |
| |
| std::vector<ValueAsString> underlying_source_variables() const override |
| { |
| std::vector<ValueAsString> vars; |
| for(int32_t y = 0; y < _format.h; ++y) |
| { |
| ValueAsString t; |
| t.str = build_variable_name(y); |
| t.type.str = get_cl_data_type(_format.dt, _format.w); |
| t.type.dt = _format.dt; |
| t.type.size = _format.w; |
| vars.push_back(t); |
| } |
| return vars; |
| } |
| |
| bool is_assignable() const override |
| { |
| return true; |
| } |
| |
| bool need_declaration() const override |
| { |
| return true; |
| } |
| |
| private: |
| std::string build_variable_name(int32_t y) const |
| { |
| std::string var_name = _basename; |
| |
| if(_format.h == 1) |
| { |
| return var_name; |
| } |
| else |
| { |
| var_name += "_"; |
| var_name += std::to_string(y); |
| } |
| |
| return var_name; |
| } |
| |
| std::string to_scalar_hex(int32_t x) const |
| { |
| switch(x) |
| { |
| case 0: |
| case 1: |
| case 2: |
| case 3: |
| case 4: |
| case 5: |
| case 6: |
| case 7: |
| case 8: |
| case 9: |
| return std::to_string(x); |
| case 10: |
| return "A"; |
| case 11: |
| return "B"; |
| case 12: |
| return "C"; |
| case 13: |
| return "D"; |
| case 14: |
| return "E"; |
| case 15: |
| return "F"; |
| default: |
| std::cout << "Unsupported hexadecimal value" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| }; |
| |
| // Unique features: It contains values in the form of string. The name used for this object is misleading since the variables can change the value over time. |
| class ClConstantTile : public IVectorTile |
| { |
| public: |
| ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt) |
| { |
| _format.w = in[0].size(); |
| _format.h = in.size(); |
| _format.dt = dt; |
| |
| _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w)); |
| |
| for(int32_t y = 0; y < _format.h; ++y) |
| { |
| for(int32_t x = 0; x < _format.w; ++x) |
| { |
| _data[y][x] = in[y][x]; |
| } |
| } |
| } |
| |
| ValueAsString scalar(int32_t x, int32_t y) const override |
| { |
| x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0)); |
| y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0)); |
| |
| ValueAsString t; |
| t.str = _data[y][x]; |
| t.type.str = get_cl_data_type(_format.dt, 1); |
| t.type.dt = _format.dt; |
| t.type.size = 1; |
| |
| return t; |
| } |
| |
| ValueAsString vector(int32_t y) const override |
| { |
| y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0)); |
| |
| return vector(0, _format.w, y); |
| } |
| |
| ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override |
| { |
| y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0)); |
| |
| ValueAsString t; |
| t.str = ""; |
| t.type.str = get_cl_data_type(_format.dt, width); |
| t.type.dt = _format.dt; |
| t.type.size = width; |
| |
| if(width > 1) |
| { |
| t.str += "((" + get_cl_data_type(_format.dt, width) + ")("; |
| } |
| |
| int32_t x = x_start; |
| for(; x < width - 1; ++x) |
| { |
| t.str += scalar(x, y).str; |
| t.str += ", "; |
| } |
| t.str += scalar(x, y).str; |
| |
| if(width > 1) |
| { |
| t.str += "))"; |
| } |
| |
| return t; |
| } |
| |
| std::vector<ValueAsString> underlying_source_variables() const override |
| { |
| std::vector<ValueAsString> vars; |
| |
| for(int32_t y = 0; y < _format.h; ++y) |
| { |
| for(int32_t x = 0; x < _format.w; ++x) |
| { |
| ValueAsString t; |
| t.str = _data[y][x]; |
| t.type.str = get_cl_data_type(_format.dt, 1); |
| t.type.dt = _format.dt; |
| t.type.size = 1; |
| vars.push_back(t); |
| } |
| } |
| |
| return vars; |
| } |
| |
| bool is_assignable() const override |
| { |
| return false; |
| } |
| |
| bool need_declaration() const override |
| { |
| return false; |
| } |
| |
| private: |
| std::vector<std::vector<std::string>> _data{}; |
| }; |
| |
// Bit mask used to extract the dimension/stride index encoded in the low nibble of a
// TensorComponentType value (see ClTensorArgument::component()).
enum class TensorComponentIndex : int32_t
{
    IndexMask = 0x0000000f,
};
| |
// Bit flags identifying the group a tensor component belongs to. Each group occupies a distinct
// bit position so membership can be tested with a bitwise AND against a TensorComponentType value
// (see ClTensorArgument::component()).
enum class TensorComponentGroup : int32_t
{
    OffsetFirstElement = 0x00000100,
    Stride = 0x00001000,
    Dimension = 0x00010000,
    FoldedDimension = 0x00100000,
    Constant = 0x01000000
};
| |
| inline std::string to_string(TensorComponentType x) |
| { |
| switch(x) |
| { |
| case TensorComponentType::Unknown: |
| return "Unknown"; |
| case TensorComponentType::OffsetFirstElement: |
| return "OffsetFirstElement"; |
| case TensorComponentType::Stride1: |
| return "Stride1"; |
| case TensorComponentType::Stride2: |
| return "Stride2"; |
| case TensorComponentType::Stride3: |
| return "Stride3"; |
| case TensorComponentType::Stride4: |
| return "Stride4"; |
| case TensorComponentType::Dim0: |
| return "Dim0"; |
| case TensorComponentType::Dim1: |
| return "Dim1"; |
| case TensorComponentType::Dim2: |
| return "Dim2"; |
| case TensorComponentType::Dim3: |
| return "Dim3"; |
| case TensorComponentType::Dim4: |
| return "Dim4"; |
| case TensorComponentType::Dim1xDim2: |
| return "Dim1xDim2"; |
| case TensorComponentType::Dim1xDim2xDim3: |
| return "Dim1xDim2xDim3"; |
| default: |
| assert(false); |
| return ""; |
| } |
| } |
| |
// Interface for a tensor passed as a kernel argument. It exposes the tensor's components
// (strides, dimensions, offsets, ...) as named source-text variables.
class ITensorArgument
{
public:
    virtual ~ITensorArgument() = default;

    /** Method to get the tensor component as a string
     *
     * @param[in] x tensor component to query
     *
     * @return the tensor component as a string
     */
    virtual std::string component(TensorComponentType x) = 0;

    /** Method to get the tensor component type declaration as a string
     *
     * @return the tensor component type declaration as a string
     */
    virtual std::string component_type_declaration() const = 0;

    /** Method to get the tensor component data type
     *
     * @return the tensor component data type
     */
    virtual DataType component_data_type() const = 0;

    /** Method to get the tensor component declarations
     *
     * @return a vector containing the tensor component declarations
     */
    virtual std::vector<TensorComponentType> component_declarations() const = 0;

    /** Method to get the name of the tensor argument.
     *
     * @return the name of the tensor argument
     */
    std::string name() const
    {
        return _basename;
    }

    /** Method to get the tensor format
     *
     * @return the format
     */
    TensorInfo format() const
    {
        return _format;
    }

protected:
    TensorInfo _format{};    // Tensor format (shape, data type, layout, id)
    std::string _basename{}; // Tensor name, used as prefix for generated variable names
};
| |
| enum class GpuTensorStorage : int32_t |
| { |
| Unknown = 0x0000, |
| BufferUint8Ptr = 0x0012, |
| Image2dReadOnly = 0x0020, |
| Image2dWriteOnly = 0x0021, |
| Image3dReadOnly = 0x0030, |
| Image3dWriteOnly = 0x0031 |
| }; |
| |
| inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s) |
| { |
| switch(s) |
| { |
| case TensorStorageType::Unknown: |
| return GpuTensorStorage::Unknown; |
| |
| case TensorStorageType::BufferUint8Ptr: |
| return GpuTensorStorage::BufferUint8Ptr; |
| |
| case TensorStorageType::Texture2dReadOnly: |
| return GpuTensorStorage::Image2dReadOnly; |
| |
| case TensorStorageType::Texture2dWriteOnly: |
| return GpuTensorStorage::Image2dWriteOnly; |
| |
| default: |
| assert(false); |
| return GpuTensorStorage::Unknown; |
| } |
| } |
| |
| inline TensorStorageType to_tensor_storage(GpuTensorStorage s) |
| { |
| switch(s) |
| { |
| case GpuTensorStorage::Unknown: |
| return TensorStorageType::Unknown; |
| |
| case GpuTensorStorage::BufferUint8Ptr: |
| return TensorStorageType::BufferUint8Ptr; |
| |
| case GpuTensorStorage::Image2dReadOnly: |
| return TensorStorageType::Texture2dReadOnly; |
| |
| case GpuTensorStorage::Image2dWriteOnly: |
| return TensorStorageType::Texture2dWriteOnly; |
| |
| default: |
| assert(false); |
| return TensorStorageType::Unknown; |
| } |
| } |
| |
// GPU-specific tensor argument: adds access to the underlying storage objects
// (buffer pointers / images) on top of the component interface.
class IGpuTensorArgument : public ITensorArgument
{
public:
    virtual ~IGpuTensorArgument() = default;

    /** Method to get the tensor storage, which is the underlying storage used to keep the data memory
     *
     * @param[in] x tensor storage to query
     *
     * @return the tensor storage as a string
     */
    virtual std::string storage(GpuTensorStorage x) = 0;

    /** Method to get the tensor storage type declaration as a string
     *
     * @param[in] x tensor storage to query
     *
     * @return the tensor storage type declaration as a string
     */
    virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;

    /** Method to get the tensor storage declarations
     *
     * @return a vector containing the tensor storage declarations
     */
    virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
};
| |
| class ClTensorArgument : public IGpuTensorArgument |
| { |
| public: |
| ClTensorArgument(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible) |
| { |
| _basename = name; |
| _format = x; |
| _return_by_value_when_possible = return_by_value_when_possible; |
| } |
| |
| // Methods to override |
| std::string component(TensorComponentType x) override |
| { |
| if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Constant))) |
| { |
| int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask); |
| return std::to_string(idx - 1); |
| } |
| |
| if(_return_by_value_when_possible) |
| { |
| if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Dimension))) |
| { |
| int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask); |
| return std::to_string(_format.shape[idx]); |
| } |
| |
| if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::FoldedDimension))) |
| { |
| switch(x) |
| { |
| case TensorComponentType::Dim1xDim2: |
| return std::to_string(_format.shape[1] * _format.shape[2]); |
| case TensorComponentType::Dim1xDim2xDim3: |
| return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[2]); |
| default: |
| std::cout << "Unsupported folded dimension" << std::endl; |
| assert(false); |
| } |
| } |
| } |
| |
| if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end()) |
| { |
| _components_required.push_back(x); |
| } |
| |
| return build_component_name(x); |
| } |
| |
| std::string component_type_declaration() const override |
| { |
| return "int"; |
| }; |
| |
| DataType component_data_type() const override |
| { |
| return DataType::Int32; |
| } |
| |
| std::string storage(GpuTensorStorage x) override |
| { |
| if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end()) |
| { |
| _storage_required.push_back(x); |
| } |
| |
| return build_storage_name(x); |
| } |
| |
| std::string storage_type_declaration(GpuTensorStorage x) const override |
| { |
| switch(x) |
| { |
| case GpuTensorStorage::BufferUint8Ptr: |
| return "__global uchar*"; |
| case GpuTensorStorage::Image2dReadOnly: |
| return "__read_only image2d_t"; |
| case GpuTensorStorage::Image2dWriteOnly: |
| return "__write_only image2d_t"; |
| case GpuTensorStorage::Image3dReadOnly: |
| return "__read_only image3d_t "; |
| case GpuTensorStorage::Image3dWriteOnly: |
| return "__write_only image3d_t "; |
| default: |
| std::cout << "Unsupported storage" << std::endl; |
| assert(false); |
| return ""; |
| } |
| }; |
| |
| std::vector<GpuTensorStorage> storage_declarations() const override |
| { |
| return _storage_required; |
| } |
| |
| std::vector<TensorComponentType> component_declarations() const override |
| { |
| return _components_required; |
| } |
| |
| private: |
| std::string build_storage_name(GpuTensorStorage x) const |
| { |
| std::string var_name = _basename; |
| |
| switch(x) |
| { |
| case GpuTensorStorage::BufferUint8Ptr: |
| return var_name + "_ptr"; |
| case GpuTensorStorage::Image2dReadOnly: |
| case GpuTensorStorage::Image2dWriteOnly: |
| return var_name + "_img2d"; |
| case GpuTensorStorage::Image3dReadOnly: |
| case GpuTensorStorage::Image3dWriteOnly: |
| return var_name + "_img3d"; |
| default: |
| std::cout << "Unsupported storage" << std::endl; |
| assert(false); |
| } |
| |
| return var_name; |
| } |
| |
| std::string build_component_name(TensorComponentType x) const |
| { |
| std::string var_name = _basename; |
| |
| switch(x) |
| { |
| case TensorComponentType::OffsetFirstElement: |
| return var_name + "_offset_first_element"; |
| case TensorComponentType::Stride1: |
| return var_name + "_stride1"; |
| case TensorComponentType::Stride2: |
| return var_name + "_stride2"; |
| case TensorComponentType::Stride3: |
| return var_name + "_stride3"; |
| case TensorComponentType::Dim0: |
| return var_name + "_dim0"; |
| case TensorComponentType::Dim1: |
| return var_name + "_dim1"; |
| case TensorComponentType::Dim2: |
| return var_name + "_dim2"; |
| case TensorComponentType::Dim3: |
| return var_name + "_dim3"; |
| case TensorComponentType::Dim1xDim2: |
| return var_name + "_dim1xdim2"; |
| case TensorComponentType::Dim1xDim2xDim3: |
| return var_name + "_dim1xdim2xdim3"; |
| default: |
| std::cout << "Unsupported component" << std::endl; |
| assert(false); |
| } |
| |
| return var_name; |
| } |
| |
| bool _return_by_value_when_possible{ false }; |
| std::vector<GpuTensorStorage> _storage_required{}; |
| std::vector<TensorComponentType> _components_required{}; |
| }; |
| |
/**
 * @brief Data structure that contains the tiles declared by the components.
 * The registry is a linear data structure that follows a principle similar to a stack. The user can use the @p increment_registry_level() method to
 * increase the level of the stack (0 when it starts). When the user uses the @p decrement_registry_level() method, the registry decreases the level of the stack
 * and removes (pops) all the tiles from the levels above the new one.
 * A tile declared at level 0 is a global tile. A global tile is visible in all parts of the code.
 * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id used to prevent name collisions
 * when declaring tiles among different components.
 *
 */
| class GpuTileRegistry |
| { |
| public: |
| enum class RegistryTileType |
| { |
| Tile, |
| Link |
| }; |
| |
| using RegistryIdSpace = int32_t; |
| using RegistryLevel = int32_t; |
| using RegistryTileName = std::string; |
| |
| struct RegistryTileTableEntry |
| { |
| RegistryLevel registry_level{ 0 }; |
| std::unique_ptr<IVectorTile> tile_object{ nullptr }; |
| }; |
| |
| struct RegistryTileTypeTableEntry |
| { |
| RegistryTileType tile_type{ RegistryTileType::Tile }; |
| RegistryTileName tile_name{}; |
| RegistryIdSpace registry_idspace{ 0 }; |
| RegistryLevel registry_level{ 0 }; |
| }; |
| |
| using RegistryTileTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>; |
| using RegistryTileTypeTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>; |
| |
| /** |
| * @brief Construct a new Gpu Tile Registry object |
| * |
| */ |
| GpuTileRegistry() |
| { |
| _language = GpuTargetLanguage::Unknown; |
| } |
| |
| /** |
| * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language |
| * |
| * @param[in] language Gpu programming language to use |
| */ |
| GpuTileRegistry(GpuTargetLanguage language) |
| { |
| _language = language; |
| } |
| |
| /** |
| * @brief Default destructor. Destroy the Gpu Tile Registry object |
| * |
| */ |
| ~GpuTileRegistry() = default; |
| |
| /** |
| * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles. |
| * Therefore, the IdSpace should be set before declaring any tiles. |
| * |
| * @param[in] id The IdSpace id |
| */ |
| void set_IdSpace(int32_t id) |
| { |
| _IdSpace = id; |
| } |
| |
| /** |
| * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles |
| * |
| * @return The IdSpace id |
| */ |
| int32_t IdSpace() const |
| { |
| return _IdSpace; |
| } |
| |
| /** |
| * @brief Gets all the IdSpace declarations defined in the tile registry. |
| * |
| * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations. |
| */ |
| std::vector<int32_t> IdSpace_declarations() const |
| { |
| std::vector<int32_t> x; |
| |
| auto it = _frags.begin(); |
| |
| while(it != _frags.end()) |
| { |
| x.push_back(it->first); |
| |
| it++; |
| } |
| |
| return x; |
| } |
| |
| /** |
| * @brief Declare a tile from a previously created tile |
| */ |
| void insert(const std::string &name, const IVectorTile *frag) |
| { |
| assert(_language == GpuTargetLanguage::OpenCL); |
| const int32_t key_IdSpace = _IdSpace; |
| const std::string key_var_name = name; |
| const std::string var_name = frag->name(); |
| TileInfo format = frag->format(); |
| |
| // First check whether a tile with the same name exists |
| IVectorTile *result = (*this)[key_var_name]; |
| assert(result == nullptr); |
| if(result == nullptr) |
| { |
| std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format); |
| |
| _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); |
| _frags[key_IdSpace][key_var_name].registry_level = _registry_level; |
| |
| _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link; |
| _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; |
| _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; |
| _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; |
| } |
| } |
| |
| /** |
| * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace() |
| * |
| * @note The reference name used for declaring the tile should not be previously used in the IdSpace |
| * |
| * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry. |
| * @param[in] format Tile format use to use |
| */ |
| void insert(const std::string &name, const TileInfo &format) |
| { |
| assert(_language == GpuTargetLanguage::OpenCL); |
| const int32_t key_IdSpace = _IdSpace; |
| const std::string key_var_name = name; |
| const std::string var_name = generate_tile_name(name); |
| |
| // First check whether a tile with the same name exists |
| IVectorTile *result = (*this)[key_var_name]; |
| assert(result == nullptr); |
| if(result == nullptr) |
| { |
| std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format); |
| _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); |
| _frags[key_IdSpace][key_var_name].registry_level = _registry_level; |
| |
| _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; |
| _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; |
| _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; |
| _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; |
| } |
| } |
| |
| /** |
| * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string |
| * |
| * @note The reference name used for declaring the tile should not be previously used in the IdSpace |
| * |
| * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry. |
| * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile |
| * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is user's responsibilty to ensure |
| * that the data type is aligned with the content of the std::string. |
| */ |
| void insert(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) |
| { |
| assert(_language == GpuTargetLanguage::OpenCL); |
| const int32_t key_IdSpace = _IdSpace; |
| const std::string key_var_name = name; |
| |
| // First check whether a tile with the same name exists |
| IVectorTile *result = (*this)[key_var_name]; |
| assert(result == nullptr); |
| if(result == nullptr) |
| { |
| std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt); |
| _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); |
| _frags[key_IdSpace][key_var_name].registry_level = _registry_level; |
| |
| _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; |
| _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; |
| _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; |
| _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; |
| } |
| } |
| |
| /** |
| * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string |
| * |
| * @note This method can be used to declare temporary tiles that need to be accessed only once. |
| * |
| * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile |
| * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is user responsibilty to ensure |
| * that the data type is aligned with what passed with the std::string. |
| * |
| * @return IVectorTile* the anonymous constant tile |
| */ |
| IVectorTile *insert(const std::vector<std::vector<std::string>> &in, DataType dt) |
| { |
| assert(_language == GpuTargetLanguage::OpenCL); |
| const int32_t key_IdSpace = _IdSpace; |
| const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++); |
| |
| // First check whether a tile with the same name exists |
| IVectorTile *result = (*this)[key_var_name]; |
| assert(result == nullptr); |
| if(result == nullptr) |
| { |
| std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt); |
| _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); |
| _frags[key_IdSpace][key_var_name].registry_level = _registry_level; |
| |
| _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; |
| _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; |
| _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; |
| _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; |
| } |
| |
| return (*this)[key_var_name]; |
| } |
| |
| /** |
| * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user |
| * |
| * @param[in] name The name of the tile to retrieve |
| * @param[in] IdSpace The IdSpace id where to search the tile |
| * |
| * @return IVectorTile* The tile |
| */ |
| IVectorTile *get(const std::string &name, int32_t IdSpace) |
| { |
| const int32_t key_IdSpace = IdSpace; |
| const std::string key_var_name = name; |
| |
| IVectorTile *result = nullptr; |
| auto search_IdSpace = _frags.find(key_IdSpace); |
| if(search_IdSpace != _frags.end()) |
| { |
| auto search_tile = _frags[key_IdSpace].find(key_var_name); |
| if(search_tile != _frags[key_IdSpace].end()) |
| { |
| result = search_tile->second.tile_object.get(); |
| assert(result != nullptr); |
| } |
| } |
| |
| return result; |
| } |
| |
| /** |
| * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace() |
| * |
| * @param[in] name The name of the tile to retrieve |
| * |
| * @return IVectorTile* The tile |
| */ |
    IVectorTile *operator[](const std::string &name)
    {
        // Convenience overload: look the tile up in the IdSpace currently
        // selected with set_IdSpace().
        return get(name, _IdSpace);
    }
| |
| /** |
| * @brief Check whether the tile in the in the IdSpace provided by the user exists |
| * |
| * @param[in] name Name of the tile to search for |
| * @param[in] IdSpace The IdSpace id where to search the tile |
| * |
| * @return true if the tile exists |
| * @return false if the tile does not exist |
| */ |
| bool has_tile(const std::string &name, int32_t IdSpace) const |
| { |
| const int32_t key_IdSpace = IdSpace; |
| const std::string key_var_name = name; |
| |
| // IVectorTile* result = nullptr; |
| auto search_IdSpace = _frags.find(key_IdSpace); |
| |
| return search_IdSpace != _frags.end(); |
| } |
| |
| /** |
| * @brief Check whether the tile within the current IdSpace exists |
| * |
| * @param[in] name Name of the tile to search for |
| * |
| * @return true if the tile exists |
| * @return false if the tile does not exist |
| */ |
    bool has_tile(const std::string &name) const
    {
        // Delegates to the two-argument overload using the current IdSpace.
        return has_tile(name, _IdSpace);
    }
| |
| /** |
| * @brief Get all the tiles declared within the IdSpace provided by the user |
| * |
| * @param[in] IdSpace IdSpace where to retrieve all the declared tiles |
| * |
| * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user |
| */ |
| std::vector<IVectorTile *> tile_declarations(int32_t IdSpace) |
| { |
| std::vector<IVectorTile *> tiles; |
| |
| std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin(); |
| |
| while(it != _frag_types[IdSpace].end()) |
| { |
| // The following line should be enabled. However, we cannot at this stage |
| // because it used to retrieve the output tile produced by each component. |
| // However, this method should NOT be used to retrieve the output tile |
| //if(it->second.tile_type == RegistryTileType::Tile) |
| { |
| tiles.push_back(get(it->second.tile_name, it->second.registry_idspace)); |
| } |
| it++; |
| } |
| |
| return tiles; |
| } |
| |
| /** |
| * @brief Increase the level of stack. |
| * |
| */ |
    void increment_registry_level()
    {
        // Open a new scope: tiles declared from now on are tagged with this
        // level and removed by the matching decrement_registry_level() call.
        _registry_level++;
    }
| |
| /** |
| * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack. |
| * |
| */ |
| void decrement_registry_level() |
| { |
| assert(_registry_level >= 0); |
| |
| // Remove all variables in the local scope |
| std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin(); |
| |
| while(it != _frags[_IdSpace].end()) |
| { |
| if(it->second.registry_level == _registry_level) |
| { |
| it = _frags[_IdSpace].erase(it); |
| } |
| else |
| { |
| it++; |
| } |
| } |
| |
| std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin(); |
| |
| while(it_type != _frag_types[_IdSpace].end()) |
| { |
| if(it_type->second.registry_level == _registry_level) |
| { |
| it_type = _frag_types[_IdSpace].erase(it_type); |
| } |
| else |
| { |
| it_type++; |
| } |
| } |
| |
| _registry_level--; |
| } |
| |
| /** |
| * @brief Get the level of the stack |
| * |
| */ |
    int32_t level() const
    {
        // Current depth of the scope stack (0 is the outermost, global level).
        return _registry_level;
    }
| |
| private: |
| // This method ensures that the key is unique among different components |
| std::string generate_tile_name(const std::string &name) |
| { |
| assert(_IdSpace >= 0); |
| if(_registry_level == 0) |
| { |
| return "_G" + std::to_string(_IdSpace) + "_" + name; |
| } |
| else |
| { |
| return name; |
| } |
| } |
| |
| RegistryTileTable _frags{}; |
| RegistryTileTypeTable _frag_types{}; |
| RegistryLevel _registry_level{ 0 }; |
| RegistryIdSpace _IdSpace{ -1 }; |
| int32_t _anonymous_frag_count{ 0 }; // Counter used to create the anonymous tiles |
| GpuTargetLanguage _language{ GpuTargetLanguage::Unknown }; // Gpu programming language |
| }; |
| |
| using TensorEntry = std::unique_ptr<IGpuTensorArgument>; |
| |
| /** |
| * @brief Data structure that contains the tensors consumed by the components. |
| * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions |
| * when declaring tensors among different components. |
| * |
| */ |
| class GpuTensorArgumentRegistry |
| { |
| public: |
| /** |
| * @brief Construct a new Gpu Tensor Registry object |
| * |
| */ |
| GpuTensorArgumentRegistry() |
| { |
| _language = GpuTargetLanguage::Unknown; |
| } |
| |
| /** |
| * @brief Construct a new Gpu Tensor Registry object |
| * |
| * @param[in] language Gpu programming language to use |
| */ |
| GpuTensorArgumentRegistry(GpuTargetLanguage language) |
| { |
| _language = language; |
| } |
| |
| /** |
| * @brief Default destructor. Destroy the Gpu Tensor Registry object |
| * |
| */ |
| ~GpuTensorArgumentRegistry() = default; |
| |
| /** |
| * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors. |
| * Therefore, the IdSpace should be set before declaring any tensors. |
| * |
| * @param[in] id The IdSpace id |
| */ |
| void set_IdSpace(int32_t id) |
| { |
| _IdSpace = id; |
| } |
| |
| /** |
| * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors |
| * |
| * @return The IdSpace id |
| */ |
| int32_t IdSpace() const |
| { |
| return _IdSpace; |
| } |
| |
| /** |
| * @brief Gets all the IdSpace declarations defined in the tensor registry. |
| * |
| * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations. |
| */ |
| std::vector<int32_t> IdSpace_declarations() const |
| { |
| std::vector<int32_t> x; |
| |
| auto it = _refs.begin(); |
| |
| while(it != _refs.end()) |
| { |
| x.push_back(it->first); |
| |
| it++; |
| } |
| |
| return x; |
| } |
| |
| /** |
| * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace() |
| * |
| * @note The reference name used for declaring the tensor should not be previously used in the IdSpace |
| * |
| * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry. |
| * @param[in] x Pair of tensor info and tensor id |
| * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components |
| */ |
| void insert(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible) |
| { |
| assert(_language == GpuTargetLanguage::OpenCL); |
| const int32_t key_IdSpace = _IdSpace; |
| const int32_t tensor_id = x.id; |
| const std::string key_var_name = name; |
| const std::string var_name = generate_tensor_name(name, tensor_id); |
| |
| // First, check whether the tensor has already a reference. If so, trigger an assert |
| assert(!has_tensor_argument(name)); |
| |
| // Check whether a tensor with that tensorID exists |
| auto result = _tensor_arguments.find(tensor_id); |
| if(result == _tensor_arguments.end()) |
| { |
| // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference |
| std::unique_ptr<ClTensorArgument> arg = std::make_unique<ClTensorArgument>(var_name, x, |
| return_by_value_when_possible); |
| _tensor_arguments[tensor_id] = std::move(arg); |
| } |
| |
| _refs[key_IdSpace][key_var_name] = tensor_id; |
| } |
| |
| /** |
| * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace() |
| * |
| * @param[in] name The name of the tensor to retrieve |
| * |
| * @return IGpuTensor* The tensor |
| */ |
| IGpuTensorArgument *operator[](const std::string &name) |
| { |
| const int32_t key_IdSpace = _IdSpace; |
| const std::string key_var_name = name; |
| |
| IGpuTensorArgument *result = nullptr; |
| auto search_IdSpace = _refs.find(key_IdSpace); |
| if(search_IdSpace != _refs.end()) |
| { |
| auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); |
| |
| if(search_tensor_id != _refs[key_IdSpace].end()) |
| { |
| const int32_t tensor_id = search_tensor_id->second; |
| auto search_tensor_argument = _tensor_arguments.find(tensor_id); |
| if(search_tensor_argument != _tensor_arguments.end()) |
| { |
| result = search_tensor_argument->second.get(); |
| } |
| assert(result != nullptr); |
| } |
| } |
| |
| return result; |
| } |
| |
| /** |
| * @brief Get all the tensors declared in the IdSpace provided by the user |
| * |
| * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors |
| */ |
| std::vector<IGpuTensorArgument *> tensor_argument_declarations() |
| { |
| std::vector<IGpuTensorArgument *> args; |
| |
| auto it = _tensor_arguments.begin(); |
| |
| while(it != _tensor_arguments.end()) |
| { |
| args.push_back(it->second.get()); |
| it++; |
| } |
| |
| return args; |
| } |
| |
| /** |
| * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists |
| * |
| * @param[in] name Name of the tensor argument to search for |
| * |
| * @return true if the tensor argument exists |
| * @return false if the tensor argument does not exist |
| */ |
| bool has_tensor_argument(const std::string &name) |
| { |
| const int32_t key_IdSpace = _IdSpace; |
| const std::string key_var_name = name; |
| |
| auto search_IdSpace = _refs.find(key_IdSpace); |
| |
| if(search_IdSpace != _refs.end()) |
| { |
| auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); |
| |
| return search_tensor_id != _refs[key_IdSpace].end(); |
| } |
| else |
| { |
| return false; |
| } |
| } |
| |
| /** |
| * @brief Check whether the tensor argument is in the the IdSpace provided by the user |
| * |
| * @param[in] name Name of the tensor argument to search for |
| * @param[in] IdSpace The IdSpace id where to search the tensor argument |
| * |
| * @return true if the tile exists |
| * @return false if the tile does not exist |
| */ |
| bool has_tensor_argument(const std::string &name, int32_t IdSpace) |
| { |
| const int32_t key_IdSpace = IdSpace; |
| const std::string key_var_name = name; |
| |
| auto search_IdSpace = _refs.find(key_IdSpace); |
| |
| if(search_IdSpace != _refs.end()) |
| { |
| auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); |
| |
| return search_tensor_id != _refs[key_IdSpace].end(); |
| } |
| else |
| { |
| return false; |
| } |
| } |
| |
| private: |
| // This method ensures that the key is unique among different components |
| std::string generate_tensor_name(const std::string &name, int32_t tensor_id) |
| { |
| assert(tensor_id >= 0); |
| |
| return name + std::to_string(tensor_id); |
| } |
| |
| std::map<int32_t, TensorEntry> _tensor_arguments{}; |
| std::map<int32_t, std::map<std::string, int32_t>> _refs{}; |
| int32_t _IdSpace{ -1 }; |
| GpuTargetLanguage _language{ GpuTargetLanguage::Unknown }; // Gpu programming language |
| }; |
| |
// Coarse operation categories. The values are spaced 0x1000 apart — presumably
// to leave room for per-category opcodes; confirm before relying on the encoding.
enum class OpType : int32_t
{
    Elementwise = 0x0000,
    Relational = 0x1000,
    Algebra = 0x2000
};
| |
| inline std::string to_string(AssignmentOp op) |
| { |
| switch(op) |
| { |
| case AssignmentOp::Decrement: |
| return "-="; |
| case AssignmentOp::Increment: |
| return "+="; |
| default: |
| assert(false); |
| return ""; |
| } |
| } |
| |
| inline std::string to_string(UnaryOp op) |
| { |
| switch(op) |
| { |
| case UnaryOp::LogicalNot: |
| return "!"; |
| case UnaryOp::BitwiseNot: |
| return "~"; |
| default: |
| assert(false); |
| return ""; |
| } |
| } |
| |
| inline std::string to_string(BinaryOp op) |
| { |
| switch(op) |
| { |
| case BinaryOp::Add: |
| return "+"; |
| case BinaryOp::Sub: |
| return "-"; |
| case BinaryOp::Mul: |
| return "*"; |
| case BinaryOp::Div: |
| return "/"; |
| case BinaryOp::Mod: |
| return "%"; |
| case BinaryOp::Equal: |
| return "=="; |
| case BinaryOp::Less: |
| return "<"; |
| case BinaryOp::LessEqual: |
| return "<="; |
| case BinaryOp::Greater: |
| return ">"; |
| case BinaryOp::GreaterEqual: |
| return ">="; |
| case BinaryOp::LogicalAnd: |
| return "&&"; |
| case BinaryOp::LogicalOr: |
| return "||"; |
| case BinaryOp::BitwiseXOR: |
| return "^"; |
| default: |
| assert(false); |
| return ""; |
| } |
| } |
| |
// Short mnemonic for a binary operator (e.g. for composing kernel/config-id strings).
// NOTE(review): the relational mnemonics look inverted ("Less" -> "gt",
// "Greater" -> "lt") and mix two naming schemes ("gteq" vs "lte"). Confirm
// whether downstream consumers depend on these exact strings before changing them.
inline std::string binary_op_string(BinaryOp op)
{
    switch(op)
    {
        case BinaryOp::Add:
            return "add";
        case BinaryOp::Sub:
            return "sub";
        case BinaryOp::Mul:
            return "mul";
        case BinaryOp::Div:
            return "div";
        case BinaryOp::Mod:
            return "mod";
        case BinaryOp::Equal:
            return "eq";
        case BinaryOp::Less:
            return "gt";
        case BinaryOp::LessEqual:
            return "gteq";
        case BinaryOp::Greater:
            return "lt";
        case BinaryOp::GreaterEqual:
            return "lte";
        default:
            // Unsupported operator
            assert(false);
            return "";
    }
}
| |
// Encodes what an Operand refers to. The value acts as a bit-field:
// 0x00001xxx marks scalar immediates (OperandUnpacker tests the 0x00001000 bit,
// and its to_tile_data_type() converts the low byte to DataType — the two enums
// are assumed to share that encoding; confirm if either changes), 0x00010000 is
// a tile, and 0x001xxxxx values are tensor components.
enum class OperandType : int32_t
{
    Unknown = 0x00000000,
    ScalarFp32 = 0x00001011, // Immediate scalar tile
    ScalarFp16 = 0x00001012, // Immediate scalar tile
    ScalarInt32 = 0x00001021, // Immediate scalar tile
    ScalarInt16 = 0x00001022, // Immediate scalar tile
    ScalarInt8 = 0x00001024, // Immediate scalar tile
    ScalarUInt32 = 0x00001031, // Immediate scalar tile
    ScalarUInt16 = 0x00001032, // Immediate scalar tile
    ScalarUInt8 = 0x00001034, // Immediate scalar tile
    ScalarBool = 0x00001041, // Immediate scalar tile
    ScalarTile = 0x00001050, // Scalar from a tile
    Tile = 0x00010000, // Tile
    TensorStride1 = 0x00100001, // Tensor component
    TensorStride2 = 0x00100002, // Tensor component
    TensorStride3 = 0x00100003, // Tensor component
    TensorStride4 = 0x00100004, // Tensor component
    TensorDim0 = 0x00100010, // Tensor component
    TensorDim1 = 0x00100020, // Tensor component
    TensorDim2 = 0x00100030, // Tensor component
    TensorDim3 = 0x00100040, // Tensor component
    TensorDim4 = 0x00100050, // Tensor component
    TensorC = 0x00100010, // Tensor component (alias of TensorDim0)
    TensorW = 0x00100020, // Tensor component (alias of TensorDim1)
    TensorH = 0x00100030, // Tensor component (alias of TensorDim2)
    TensorD = 0x00100040, // Tensor component (alias of TensorDim3)
    TensorN = 0x00100050, // Tensor component (alias of TensorDim4)
    TensorDim1xDim2 = 0x00100100, // Tensor component
    TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
    TensorWxH = 0x00100300, // Tensor component
    TensorWxHxD = 0x00100400, // Tensor component
    TensorDataOffset = 0x00100500, // Tensor component
};
| |
/** 2D position of a single scalar element inside a tile. Negative values mean "not set". */
struct ScalarTileCoord
{
    // Invalid coordinate (-1, -1); members are initialized by the NSDMIs below
    ScalarTileCoord() = default;

    // Coordinate at position (x0, y0)
    ScalarTileCoord(int32_t x0, int32_t y0)
        : x(x0), y(y0)
    {
    }

    int32_t x{ -1 };
    int32_t y{ -1 };
};
| |
| /** |
| * @brief Operand class. This object is used to pass the operands to the operations performed by the writer. |
| * Operand can be of three types: |
| * -# Scalar immediate: constant expression |
| * -# Tile: A tile |
| * -# Tensor component: A component (scalar) of a tensor |
| * |
| */ |
| class Operand |
| { |
| public: |
| Operand(const std::string &val) |
| { |
| _str = val; |
| _type = OperandType::Tile; |
| } |
| |
| Operand(const std::string &val, const ScalarTileCoord &coord) |
| { |
| _str = val; |
| _type = OperandType::ScalarTile; |
| _coord = coord; |
| } |
| |
| Operand(const std::string &val, OperandType type) |
| { |
| _str = val; |
| _type = type; |
| } |
| |
| Operand(const Operand &t) |
| { |
| _str = t.value(); |
| _type = t.type(); |
| } |
| |
| Operand &operator=(const Operand &t) |
| { |
| _str = t.value(); |
| _type = t.type(); |
| _coord = t.scalar_tile_coordinate(); |
| return *this; |
| } |
| |
| std::string value() const |
| { |
| return _str; |
| } |
| |
| OperandType type() const |
| { |
| return _type; |
| } |
| |
| ScalarTileCoord scalar_tile_coordinate() const |
| { |
| return _coord; |
| } |
| |
| private: |
| std::string _str{}; |
| OperandType _type{ OperandType::Unknown }; |
| ScalarTileCoord _coord{}; |
| }; |
| |
| using GpuSamplerTensorStorage = GpuTensorStorage; |
| |
// Describes how a tensor is accessed by a kernel: the dimensionality-reduction
// layout, the storage backing the tensor, and the boundary-handling policy per axis.
struct GpuSampler
{
    GpuSampler() = default;

    TensorSamplerFormat format{ TensorSamplerFormat::Unknown }; // Layout mapping the tensor to 3 logical dims (e.g. C_W_H, C_WH_1)
    GpuSamplerTensorStorage storage{ GpuSamplerTensorStorage::Unknown }; // Buffer or image storage backing the tensor
    TensorSamplerAddressModeX address_mode_x{ TensorSamplerAddressModeX::Unknown }; // Boundary policy along x
    TensorSamplerAddressModeY address_mode_y{ TensorSamplerAddressModeY::Unknown }; // Boundary policy along y
    TensorSamplerAddressModeZ address_mode_z{ TensorSamplerAddressModeZ::Unknown }; // Boundary policy along z
};
| |
| inline GpuSampler |
| create_simple_sampler(const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, |
| int32_t step_z) |
| { |
| CKW_UNUSED(step_x, step_y, step_z); |
| |
| auto tensor = tensor_info_id->shape; |
| |
| GpuSampler dst_sampler; |
| dst_sampler.format = sampler.format; |
| dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr; |
| dst_sampler.address_mode_x = sampler.address_mode_x; |
| dst_sampler.address_mode_y = sampler.address_mode_y; |
| dst_sampler.address_mode_z = sampler.address_mode_z; |
| |
| int32_t dim_x = 0; |
| int32_t dim_y = 0; |
| int32_t dim_z = 0; |
| |
| switch(sampler.format) |
| { |
| case TensorSamplerFormat::C_W_H: |
| dim_x = tensor[0]; |
| dim_y = tensor[1]; |
| dim_z = tensor[2]; |
| break; |
| case TensorSamplerFormat::C_WH_1: |
| dim_x = tensor[0]; |
| dim_y = tensor[1] * tensor[2]; |
| dim_z = 1; |
| break; |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| break; |
| } |
| |
| if(dim_x == 1) |
| { |
| assert(step_x == 1); |
| dst_sampler.address_mode_x = TensorSamplerAddressModeX::None; |
| } |
| |
| if(dim_y == 1) |
| { |
| assert(step_y == 1); |
| dst_sampler.address_mode_y = TensorSamplerAddressModeY::None; |
| } |
| |
| if(dim_z == 1) |
| { |
| assert(step_z == 1); |
| dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None; |
| } |
| |
| return dst_sampler; |
| } |
| |
| class GpuOutputSampler |
| { |
| public: |
| GpuOutputSampler() = default; |
| |
| /** |
| * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once |
| * by the root component. Once initialized, all simpler components will need to used this sampler |
| * or a broadcasted version of it |
| * |
| * @param[in] sampler GpuSampler |
| * @param[in] step_x Increment step in the X direction. Not necessarily it is the same of n0 of tile! |
| * @param[in] step_y Increment step in the Y direction. Not necessarily it is the same of m0 of tile! |
| * @param[in] step_z Increment step in the Z direction. Not necessarily it is the same of d0 of tile! |
| */ |
| void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, |
| TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z) |
| { |
| assert(_is_initialized == false); |
| |
| _step_x = step_x; |
| _step_y = step_y; |
| _step_z = step_z; |
| _tensor_info_id = tensor_info_id; |
| _sampler = create_sampler(tensor_storage, tensor_format); |
| _is_initialized = true; |
| }; |
| |
| GpuSampler sampler() const |
| { |
| return _sampler; |
| }; |
| |
| int32_t step_x() const |
| { |
| return _step_x; |
| }; |
| |
| int32_t step_y() const |
| { |
| return _step_y; |
| }; |
| |
| int32_t step_z() const |
| { |
| return _step_z; |
| }; |
| |
| private: |
| GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format) |
| { |
| // Output can only be in output mode |
| assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly); |
| assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly); |
| |
| auto tensor = _tensor_info_id->shape; |
| |
| GpuSampler sampler; |
| sampler.format = tensor_format; |
| sampler.storage = tensor_storage; |
| sampler.address_mode_x = TensorSamplerAddressModeX::None; |
| sampler.address_mode_y = TensorSamplerAddressModeY::None; |
| sampler.address_mode_z = TensorSamplerAddressModeZ::None; |
| |
| // In the case of texture, we do not need any special checks at the border |
| if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr) |
| { |
| int32_t dim_x = 0; |
| int32_t dim_y = 0; |
| int32_t dim_z = 0; |
| |
| switch(tensor_format) |
| { |
| case TensorSamplerFormat::C_W_H: |
| dim_x = tensor[0]; |
| dim_y = tensor[1]; |
| dim_z = tensor[2]; |
| break; |
| case TensorSamplerFormat::C_WH_1: |
| dim_x = tensor[0]; |
| dim_y = tensor[1] * tensor[2]; |
| dim_z = 1; |
| break; |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| break; |
| } |
| |
| if((dim_x % _step_x) != 0 && dim_x != 1) |
| { |
| sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin; |
| } |
| |
| if((dim_y % _step_y) != 0 && dim_y != 1) |
| { |
| sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly; |
| } |
| |
| if((dim_z % _step_z) != 0 && dim_z != 1) |
| { |
| sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly; |
| } |
| } |
| |
| return sampler; |
| } |
| |
| GpuSampler _sampler{}; // GpuSampler |
| int32_t _step_x{ 1 }; |
| int32_t _step_y{ 1 }; |
| int32_t _step_z{ 1 }; |
| const TensorInfo *_tensor_info_id{ nullptr }; |
| bool _is_initialized{ false }; |
| }; |
| |
| /** |
| * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer. |
| */ |
| class TensorOperand |
| { |
| public: |
| TensorOperand(const std::string &val, GpuSampler sampler) |
| : _str(val), _sampler(sampler) |
| { |
| } |
| |
| TensorOperand &operator=(const TensorOperand &t) |
| { |
| _str = t.value(); |
| _sampler = t.sampler(); |
| return *this; |
| } |
| |
| std::string value() const |
| { |
| return _str; |
| } |
| |
| GpuSampler sampler() const |
| { |
| return _sampler; |
| } |
| |
| private: |
| std::string _str{}; |
| GpuSampler _sampler{}; |
| }; |
| |
| /** |
| * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer |
| * This data structure must be initialized before being passed to the Gpu Kernel Writer |
| * |
| */ |
class GpuKernelWriterDataHolder
{
public:
    /**
     * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
     *        the GPU target and target specific capabilities (extensions). For now, we just initialize the
     *        programming language
     *
     * @param[in] language Gpu programming language to use
     */
    GpuKernelWriterDataHolder(GpuTargetLanguage language)
        : tiles(language), arguments(language), code(""), _language(language)
    {
    }

    /**
     * @brief Get the Gpu programming language used
     *
     * @return GpuTargetLanguage the Gpu programming language
     */
    GpuTargetLanguage programming_language() const
    {
        return _language;
    }

    /**
     * @brief @ref GpuTileRegistry — tiles declared while writing the kernel
     *
     */
    GpuTileRegistry tiles{};
    /**
     * @brief @ref GpuTensorArgumentRegistry — tensor arguments consumed by the kernel
     *
     */
    GpuTensorArgumentRegistry arguments{};
    /**
     * @brief @ref GpuOutputSampler. Sampler shared by the components for the output tensor
     *
     */
    GpuOutputSampler output_sampler{};
    /**
     * @brief Source code accumulated so far; emitted as the kernel body by generate_code()
     *
     */
    std::string code{};

    // GpuExtensionRegistry extensions{};
private:
    GpuTargetLanguage _language;
};
| |
// Local work-group size candidate; collected in GpuKernel::list_lws for the tuning stage
struct LWS
{
    int32_t x{ 1 };
    int32_t y{ 1 };
    int32_t z{ 1 };
};
| |
| /** |
| * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker |
| * declare an anonymous tile in the tile registry. |
| */ |
class OperandUnpacker
{
public:
    // Anonymous tiles created while unpacking live in a dedicated registry
    // scope, opened here and closed again by the destructor.
    OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments)
        : _tiles(tiles), _arguments(arguments)
    {
        // Increase the level of the stack to allocate possible temporary tiles
        _tiles.increment_registry_level();
    };

    ~OperandUnpacker()
    {
        // Decrease the level of the stack to deallocate any temporary tiles
        _tiles.decrement_registry_level();
    }

    // Resolve an Operand to a tile: either the registered tile itself, or an
    // anonymous single-element tile wrapping a scalar immediate / tile element /
    // tensor component.
    IVectorTile *unpack(const Operand &src)
    {
        // Get the tile
        if(src.type() == OperandType::Tile)
        {
            assert(_tiles.has_tile(src.value()));
            return _tiles[src.value()];
        }
        // Create an anonymous tile with a constant
        // (bit 0x00001000 marks the scalar-immediate group of OperandType values)
        else if(static_cast<int32_t>(src.type()) & 0x00001000)
        {
            if(src.type() == OperandType::ScalarTile)
            {
                // Read a single element out of an existing tile and wrap it
                // in a 1x1 anonymous constant tile.
                ScalarTileCoord coord = src.scalar_tile_coordinate();
                assert(_tiles.has_tile(src.value()));
                assert(coord.x >= 0);
                assert(coord.y >= 0);
                auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
                return _tiles.insert({ { { val.str } } }, val.type.dt);
            }
            else
            {
                // Immediate literal: the data type is encoded in the OperandType
                return _tiles.insert({ { { src.value() } } }, to_tile_data_type(src.type()));
            }
        }
        // Create an anonymous tile with the tensor component
        else
        {
            assert(_arguments.has_tensor_argument(src.value()));
            auto x = _arguments[src.value()];
            const std::string val = x->component(to_tensor_component(src.type()));
            const DataType dt = x->component_data_type();
            return _tiles.insert({ { { val } } }, dt);
        }
    }

private:
    // Extract the DataType from a scalar OperandType value — assumes the low
    // byte of OperandType mirrors the DataType encoding; confirm if either
    // enum's values change.
    DataType to_tile_data_type(OperandType x)
    {
        return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
    }

    // Map a tensor-component OperandType to the corresponding TensorComponentType
    TensorComponentType to_tensor_component(OperandType x)
    {
        switch(x)
        {
            case OperandType::TensorDim0:
                return TensorComponentType::Dim0;
            case OperandType::TensorDim1:
                return TensorComponentType::Dim1;
            case OperandType::TensorDim2:
                return TensorComponentType::Dim2;
            case OperandType::TensorDim3:
                return TensorComponentType::Dim3;
            case OperandType::TensorDim4:
                return TensorComponentType::Dim4;
            case OperandType::TensorStride1:
                return TensorComponentType::Stride1;
            case OperandType::TensorStride2:
                return TensorComponentType::Stride2;
            case OperandType::TensorStride3:
                return TensorComponentType::Stride3;
            case OperandType::TensorStride4:
                return TensorComponentType::Stride4;
            case OperandType::TensorDim1xDim2:
                return TensorComponentType::Dim1xDim2;
            case OperandType::TensorDim1xDim2xDim3:
                return TensorComponentType::Dim1xDim2xDim3;
            case OperandType::TensorDataOffset:
                return TensorComponentType::OffsetFirstElement;
            default:
                // Not a tensor-component operand
                assert(false);
                return TensorComponentType::Unknown;
        }
    }

    GpuTileRegistry &_tiles;
    GpuTensorArgumentRegistry &_arguments;
};
| |
| /** |
| * @brief Utility class used to get the tensor argument from the operand. If the operand is not a tile, @ref OperandUnpacker |
| * declare an anonymous tile in the tile registry. |
| * Tensor dimension reduction aims for reducing the tensor data dimension while keeping data's tensor structure. |
| */ |
| class TensorOperandUnpacker |
| { |
| public: |
| TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments) |
| : _arguments(arguments){}; |
| |
| IGpuTensorArgument *unpack(const TensorOperand &src) |
| { |
| assert(_arguments.has_tensor_argument(src.value())); |
| return _arguments[src.value()]; |
| } |
| |
| private: |
| GpuTensorArgumentRegistry &_arguments; |
| }; |
| |
| /** |
| * @brief The GpuKernel will be used in three occasions (stages): |
| * #- Compilation stage |
| * #- Tuning stage |
| * #- Dispatch stage |
| */ |
struct GpuKernel
{
    // Compilation stage
    std::string code{};                          // Source code, required for the compilation stage
    std::vector<GpuExtensions> list_extensions{}; // Extensions, required for the compilation stage
    // Tuning stage
    std::string config_id{};   // Unique id, required for the tuning stage
    std::vector<LWS> list_lws{}; // LWS to test, required for the tuning stage
    // Dispatch stage
    GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
    // NOTE(review): the two members below lack the {} initializer the other members
    // have — harmless (std::vector default-constructs empty), but inconsistent.
    std::vector<std::pair<int32_t, GpuTensorStorage>> list_tensor_storages;   // List of tensor storages, required for the dispatch stage
    std::vector<std::pair<int32_t, TensorComponentType>> list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage)
};
| |
| // Generate all extension pragmas (hardcoded for now) |
// Emit the (currently hardcoded) extension pragmas. Each pragma is wrapped in an
// #if defined(...) guard, so it is a no-op on devices lacking the extension.
inline std::string generate_extensions()
{
    return R"(
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif // defined(cl_khr_fp16)

#if defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif // defined(cl_arm_integer_dot_product_int8)

#if defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif // defined(cl_arm_integer_dot_product_accumulate_int8)

#if defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(cl_arm_printf);
)";
}
| |
| // This function should produce an object with the source |
| inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string &name) |
| { |
| std::string code; |
| code += generate_extensions(); |
| code += "__kernel void "; |
| code += name; |
| code += "(\n"; |
| |
| auto IdSpaces = in.arguments.IdSpace_declarations(); |
| |
| std::vector<std::string> arg_str; |
| |
| auto tensor_args = in.arguments.tensor_argument_declarations(); |
| |
| for(auto &i : tensor_args) |
| { |
| // For each tensor used, get the storage and tensor components |
| auto storages = i->storage_declarations(); |
| auto components = i->component_declarations(); |
| |
| for(auto &y : storages) |
| { |
| std::string str; |
| str += i->storage_type_declaration(y); |
| str += " "; |
| str += i->storage(y); |
| arg_str.push_back(str); |
| } |
| |
| for(auto &y : components) |
| { |
| std::string str; |
| str += i->component_type_declaration(); |
| str += " "; |
| str += i->component(y); |
| arg_str.push_back(str); |
| } |
| } |
| |
| for(size_t i = 0; i < arg_str.size(); ++i) |
| { |
| code += arg_str[i]; |
| if(i + 1 < arg_str.size()) |
| { |
| code += ",\n"; |
| } |
| } |
| |
| code += ")\n"; |
| code += "{\n"; |
| code += in.code; |
| code += "}\n"; |
| |
| return code; |
| } |
| |
| /** |
| * @brief This class is responsible to map a N-Tensor to a 3d tensor. The mapper needs the GpuSampler to know |
| * how to reduce the dimensionality of a tensor |
| * |
| */ |
| class GpuTensor3dMapper |
| { |
| public: |
| GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler) |
| : _sampler(sampler), _tensor(tensor){}; |
| |
| std::string tensor_component_x() const |
| { |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| case TensorSamplerFormat::C_W_H: |
| return _tensor->component(TensorComponentType::Dim0); |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| std::string tensor_component_y() const |
| { |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| return _tensor->component(TensorComponentType::Dim1xDim2); |
| case TensorSamplerFormat::C_W_H: |
| return _tensor->component(TensorComponentType::Dim1); |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| std::string tensor_component_z() const |
| { |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| return "1"; |
| case TensorSamplerFormat::C_W_H: |
| return _tensor->component(TensorComponentType::Dim2); |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| std::string tensor_component_stride_y() const |
| { |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| case TensorSamplerFormat::C_W_H: |
| return _tensor->component(TensorComponentType::Stride1); |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| std::string tensor_component_stride_z() const |
| { |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| return "0"; |
| case TensorSamplerFormat::C_W_H: |
| return _tensor->component(TensorComponentType::Stride2); |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| std::string tensor_component_stride_batch() const |
| { |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| case TensorSamplerFormat::C_W_H: |
| return _tensor->component(TensorComponentType::Stride3); |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| bool is_one_component_x() const |
| { |
| auto t = _tensor->format(); |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| case TensorSamplerFormat::C_W_H: |
| return t.shape[0] == 1; |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| bool is_one_component_y() const |
| { |
| auto t = _tensor->format(); |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| return (t.shape[1] * t.shape[2]) == 1; |
| case TensorSamplerFormat::C_W_H: |
| return t.shape[1] == 1; |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| bool is_one_component_z() const |
| { |
| auto t = _tensor->format(); |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| return true; |
| case TensorSamplerFormat::C_W_H: |
| return t.shape[2] == 1; |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| bool is_one_component_batch() const |
| { |
| auto t = _tensor->format(); |
| const auto format = _sampler.format; |
| switch(format) |
| { |
| case TensorSamplerFormat::C_WH_1: |
| case TensorSamplerFormat::C_W_H: |
| return t.shape[3] == 1; |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| GpuSampler gpu_sampler() const |
| { |
| return _sampler; |
| } |
| |
| IGpuTensorArgument *tensor_argument() const |
| { |
| return _tensor; |
| } |
| |
| private: |
| GpuSampler _sampler; |
| IGpuTensorArgument *_tensor; |
| }; |
| |
/** Configuration options for a GPU kernel writer. */
struct GpuKernelWriterAttribute
{
    // When true, tensor components (dimensions/strides) are passed to the kernel by value.
    bool return_tensor_component_by_value = false;
};
| |
// Rounding mode used when emitting cast/convert operations.
// NOTE(review): the names mirror the OpenCL convert_<type> rounding-mode suffixes
// (_rte, _rtz, _rtp, _rtn) -- confirm against the code that consumes this enum.
enum class RoundingMode
{
    None, // no explicit rounding mode
    Rte,  // round to nearest even
    Rtz,  // round toward zero
    Rtp,  // round toward positive infinity
    Rtn   // round toward negative infinity
};
| |
| // https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html |
/** Interface for a GPU kernel writer.
 *
 * Implementations append source text for the target language into an internal
 * code buffer: declarations, expressions, control-flow headers, and tensor
 * load/store operations.
 */
class IGpuKernelWriter
{
public:
    virtual ~IGpuKernelWriter() = default;

    // Set the IdSpace used to disambiguate the names of subsequently generated fragments
    virtual void set_IdSpace(int32_t id) = 0;

    // Make an existing tile visible under a new name without emitting a declaration
    virtual void import_tile(const std::string &dst, const IVectorTile *src) = 0;

    // Register a tensor as a kernel argument
    virtual void declare_argument(const std::string &name, const TensorInfo &tensor) = 0;

    // Register a tile and emit its variable declarations
    virtual void declare_tile(const std::string &name, const TileInfo &info) = 0;

    // Register a tile of compile-time constants (no declaration is emitted)
    virtual void declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) = 0;

    // Append raw text to the generated code
    virtual void write_text(const std::string &x) = 0;

    // Open/close a brace-delimited scope (also scopes tile registration)
    virtual void compound_statement_begin() = 0;

    virtual void compound_statement_end() = 0;

    // Operations
    virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0;

    virtual void op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0;

    virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0;

    virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0;

    virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0;

    virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;

    virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0;

    virtual void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) = 0;

    virtual void op_binary_elementwise_function(const Operand &dst_name, BinaryFunction func, const Operand &first_name, const Operand &second_name) = 0;

    virtual void op_ternary_elementwise_function(const Operand &dst_name, TernaryFunction func, const Operand &first_name, const Operand &second_name, const Operand &third_name) = 0;

    // Control-flow headers: each emits only the condition/loop header; the body
    // is written by the caller between compound_statement_begin()/end()
    virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;

    virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;

    virtual void op_else_header() = 0;

    virtual void op_for_loop_header(const Operand &var_name, BinaryOp cond_op, const Operand &cond_value, const Operand &update_var, AssignmentOp update_op, const Operand &update_value) = 0;

    // Tensor load/store; `b` is the batch coordinate (defaults to 0)
    virtual void op_load_indirect(const TensorOperand &tensor, const Operand &dst, const Operand &x, const Operand &y_indirect, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;

    virtual void op_load_immediate(const TensorOperand &tensor, const Operand &dst, const Operand &x, const Operand &y, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32), const Operand &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;

    virtual void op_store_immediate(const TensorOperand &tensor, const Operand &src, const Operand &x, const Operand &y, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;

    virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0;

    virtual void op_return() = 0;

    // Utils
    // It is the process of converting
    virtual void util_get_indirect_buffer(const Operand &dst, const TensorOperand &tensor, const Operand &x,
                                          const Operand &y, const Operand &x_off, const Operand &y_off) = 0;
};
| |
// Direction of a generated tensor memory operation.
enum class GpuLoadStoreType
{
    Load  = 1, // read from the tensor into a tile
    Store = 2  // write a tile back to the tensor
};
| |
/** Base class for helpers that emit the load/store portion of a kernel.
 *
 * Usage protocol (see the concrete writers below):
 *  1. initialize() once with the destination tile and the x/z/batch coordinates;
 *  2. write() once per row, passing the row index within the tile and its y coordinate;
 *  3. finalize() once to close any pending guards / emit leftover operations.
 */
class IGpuLoadStoreHelperWriter
{
public:
    IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type)
        : _writer(x), _mapper(mapper), _type(type)
    {
    }

    IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;

    IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;

    virtual ~IGpuLoadStoreHelperWriter() = default;

    virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;

    // y = (row index within the tile, y-coordinate expression)
    virtual void write(const std::pair<int32_t, std::string> &y) = 0;

    virtual void finalize() = 0;

protected:
    IGpuKernelWriter *_writer;  // not owned; used to append the generated code
    GpuTensor3dMapper _mapper;  // maps the N-d tensor onto 3d coordinates
    GpuLoadStoreType  _type;    // load or store
};
| |
| class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter |
| { |
| public: |
| ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type) |
| : IGpuLoadStoreHelperWriter(x, mapper, type) |
| { |
| } |
| |
| ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default; |
| |
| ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default; |
| |
| static bool |
| validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst) |
| { |
| CKW_UNUSED(x, type, dst); |
| |
| if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr) |
| { |
| return false; |
| } |
| return true; |
| } |
| |
| void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override |
| { |
| assert(validate(_writer, _mapper, _type, dst)); |
| |
| _dst = dst; |
| _ls_width_full = dst->format().w; |
| |
| _coord_x = x->scalar(0, 0).str; |
| _coord_z = z->scalar(0, 0).str; |
| _coord_b = b->scalar(0, 0).str; |
| _coord_orig_z = _coord_z; |
| |
| out_of_bound_initialize_x(_coord_x); |
| out_of_bound_initialize_z(_coord_z); |
| |
| /* |
| meaning of else: |
| - x: partial load/store |
| - y: no load/store operation |
| - z: no load/store operation |
| if(x) |
| { |
| if(z) |
| { |
| if(y) |
| { |
| // full load/store width |
| } |
| else |
| { |
| // no load/store |
| } |
| } |
| else |
| { |
| // no load/store |
| } |
| } |
| else |
| { |
| if(z) |
| { |
| if(y) |
| { |
| // partial load/store width |
| } |
| else |
| { |
| // no load/store |
| } |
| } |
| else |
| { |
| // no load/store |
| } |
| } |
| */ |
| } |
| |
| void write(const std::pair<int32_t, std::string> &y) override |
| { |
| int32_t idx_y = y.first; |
| std::string coord_y = y.second; |
| |
| // The only check required is on Y. |
| out_of_bound_initialize_y(coord_y); |
| |
| const std::string dst = _dst->vector(idx_y).str; |
| const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b); |
| const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address); |
| |
| _writer->write_text(ls_buf); |
| _writer->write_text(";\n"); |
| |
| out_of_bound_finalize_y(dst); |
| |
| // The left over load/store will be written in the finalize stage |
| if(_ls_width_part.size() != 0) |
| { |
| int32_t w = 0; |
| for(auto &p : _ls_width_part) |
| { |
| const std::string dst0 = _dst->vector(w, p, idx_y).str; |
| const std::string coord_x = _coord_x + " + " + std::to_string(w); |
| const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b); |
| const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address); |
| _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0)); |
| |
| w += p; |
| } |
| } |
| } |
| |
| void finalize() override |
| { |
| out_of_bound_finalize_z(); |
| out_of_bound_finalize_x(); |
| } |
| |
| private: |
| IVectorTile *_dst{ nullptr }; |
| int32_t _ls_width_full{ 0 }; |
| std::vector<int32_t> _ls_width_part{}; |
| std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x{}; |
| std::string _coord_x{}; |
| std::string _coord_z{}; |
| std::string _coord_orig_z{}; |
| std::string _coord_b{}; |
| |
| void out_of_bound_initialize_x(std::string &coord) |
| { |
| if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) |
| { |
| auto tensor_format = _mapper.tensor_argument()->format(); |
| auto shape = tensor_format.shape; |
| |
| _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full); |
| if(_ls_width_part.size() != 0) |
| { |
| _writer->write_text("if(" + coord + " > 0)\n"); |
| _writer->compound_statement_begin(); |
| } |
| } |
| }; |
| |
| void out_of_bound_finalize_x() |
| { |
| if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) |
| { |
| if(_ls_width_part.size() != 0) |
| { |
| _writer->compound_statement_end(); |
| _writer->write_text("else\n"); |
| _writer->compound_statement_begin(); |
| |
| out_of_bound_initialize_z(_coord_orig_z); |
| for(auto &i : _leftovers_x) |
| { |
| out_of_bound_initialize_y(i.first.second); |
| _writer->write_text(i.second); |
| _writer->write_text(";\n"); |
| out_of_bound_finalize_y(i.first.first); |
| } |
| out_of_bound_finalize_z(); |
| _writer->compound_statement_end(); |
| } |
| } |
| }; |
| |
| void out_of_bound_initialize_y(std::string &coord) |
| { |
| std::string max = ""; |
| |
| const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; |
| |
| switch(address_mode_y) |
| { |
| case TensorSamplerAddressModeY::Skip: |
| case TensorSamplerAddressModeY::ClampToBorder: |
| // NOTE: This line should not be moved outside of the switch statement. |
| // The reason for that is because when we query the component, the component is marked as used |
| // and added to the list of arguments of the kernel. Since, not in all cases this component is required, |
| // we should request the component only when used |
| max = _mapper.tensor_component_y(); |
| _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeY::SkipMinEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: |
| _writer->write_text("if(" + coord + " >= 0)\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeY::SkipMaxEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: |
| max = _mapper.tensor_component_y(); |
| _writer->write_text("if(" + coord + " < " + max + ")\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeY::ClampToNearest: |
| max = _mapper.tensor_component_y(); |
| coord = "clamp(" + coord + ", 0, " + max + " - 1)"; |
| break; |
| case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: |
| max = _mapper.tensor_component_y(); |
| coord = "min(" + coord + ", " + max + " - 1)"; |
| break; |
| case TensorSamplerAddressModeY::ClampToMinEdgeOnly: |
| coord = "max(" + coord + ", 0)"; |
| break; |
| case TensorSamplerAddressModeY::None: |
| break; |
| default: |
| std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl; |
| assert(false); |
| } |
| }; |
| |
| void out_of_bound_finalize_y(const std::string &dst) |
| { |
| const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; |
| |
| switch(address_mode_y) |
| { |
| case TensorSamplerAddressModeY::ClampToBorder: |
| case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: |
| case TensorSamplerAddressModeY::Skip: |
| case TensorSamplerAddressModeY::SkipMaxEdgeOnly: |
| case TensorSamplerAddressModeY::SkipMinEdgeOnly: |
| _writer->compound_statement_end(); |
| break; |
| case TensorSamplerAddressModeY::None: |
| break; |
| |
| default: |
| assert(false); |
| } |
| |
| switch(address_mode_y) |
| { |
| case TensorSamplerAddressModeY::ClampToBorder: |
| case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: |
| _writer->write_text("else\n"); |
| _writer->compound_statement_begin(); |
| _writer->write_text(dst); |
| _writer->write_text(" = 0.0f;\n"); |
| _writer->compound_statement_end(); |
| break; |
| case TensorSamplerAddressModeY::None: |
| break; |
| |
| default: |
| assert(false); |
| } |
| }; |
| |
| void out_of_bound_initialize_z(std::string &coord) |
| { |
| std::string max = ""; |
| |
| const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; |
| |
| switch(address_mode_z) |
| { |
| case TensorSamplerAddressModeZ::Skip: |
| max = _mapper.tensor_component_z(); |
| _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeZ::SkipMinEdgeOnly: |
| _writer->write_text("if(" + coord + " >= 0)\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeZ::SkipMaxEdgeOnly: |
| max = _mapper.tensor_component_z(); |
| _writer->write_text("if(" + coord + " < " + max + ")\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeZ::ClampToNearest: |
| max = _mapper.tensor_component_z(); |
| coord = "clamp(" + coord + ", 0, " + max + " - 1)"; |
| break; |
| case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly: |
| max = _mapper.tensor_component_z(); |
| coord = "min(" + coord + ", " + max + " - 1)"; |
| break; |
| case TensorSamplerAddressModeZ::ClampToMinEdgeOnly: |
| coord = "max(" + coord + ", 0)"; |
| break; |
| case TensorSamplerAddressModeZ::None: |
| break; |
| default: |
| std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl; |
| assert(false); |
| } |
| }; |
| |
| void out_of_bound_finalize_z() |
| { |
| const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; |
| |
| switch(address_mode_z) |
| { |
| case TensorSamplerAddressModeZ::Skip: |
| case TensorSamplerAddressModeZ::SkipMinEdgeOnly: |
| case TensorSamplerAddressModeZ::SkipMaxEdgeOnly: |
| _writer->compound_statement_end(); |
| break; |
| case TensorSamplerAddressModeZ::None: |
| break; |
| |
| default: |
| assert(false); |
| } |
| }; |
| |
| std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const |
| { |
| std::vector<int32_t> x; |
| |
| switch(ls_leftover_vector_width) |
| { |
| case 0: |
| break; |
| case 1: |
| case 2: |
| case 3: |
| case 4: |
| case 8: |
| case 16: |
| x.push_back(ls_leftover_vector_width); |
| break; |
| case 5: |
| x.push_back(4); |
| x.push_back(1); |
| break; |
| case 6: |
| x.push_back(4); |
| x.push_back(2); |
| break; |
| case 7: |
| x.push_back(4); |
| x.push_back(3); |
| break; |
| case 9: |
| x.push_back(8); |
| x.push_back(1); |
| break; |
| case 10: |
| x.push_back(8); |
| x.push_back(2); |
| break; |
| case 11: |
| x.push_back(8); |
| x.push_back(3); |
| break; |
| case 12: |
| x.push_back(8); |
| x.push_back(4); |
| break; |
| case 13: |
| x.push_back(8); |
| x.push_back(4); |
| x.push_back(1); |
| break; |
| case 14: |
| x.push_back(8); |
| x.push_back(4); |
| x.push_back(2); |
| break; |
| case 15: |
| x.push_back(8); |
| x.push_back(4); |
| x.push_back(3); |
| break; |
| |
| default: |
| assert(false); |
| } |
| return x; |
| } |
| |
| std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data, |
| const std::string &address) |
| { |
| switch(type) |
| { |
| case GpuLoadStoreType::Load: |
| if(vector_width != 1) |
| { |
| return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")"; |
| } |
| else |
| { |
| return data + " = *(" + address + ")"; |
| } |
| break; |
| case GpuLoadStoreType::Store: |
| if(vector_width != 1) |
| { |
| return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")"; |
| } |
| else |
| { |
| return "*(" + address + ") = " + data; |
| } |
| break; |
| default: |
| std::cout << "Unsupported GpuLoadStoreType" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| std::string to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z, |
| const std::string &b) const |
| { |
| auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage); |
| assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr); |
| const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage); |
| const std::string dst_type = get_cl_data_type(_dst->format().dt, 1); |
| |
| std::string address; |
| address += "(__global "; |
| address += dst_type; |
| address += "*)("; |
| address += ptr_buf; |
| if(x != "0" && (_mapper.is_one_component_x() != true)) |
| { |
| address += " + ("; |
| address += x + ") * sizeof(" + dst_type + ")"; |
| } |
| if(y != "0" && (_mapper.is_one_component_y() != true)) |
| { |
| const std::string stride_y = _mapper.tensor_component_stride_y(); |
| address += " + ("; |
| address += y + ")"; |
| address += " * "; |
| address += stride_y; |
| } |
| if(z != "0" && (_mapper.is_one_component_z() != true)) |
| { |
| const std::string stride_z = _mapper.tensor_component_stride_z(); |
| address += " + ("; |
| address += z + ")"; |
| address += " * "; |
| address += stride_z; |
| } |
| if(b != "0" && (_mapper.is_one_component_batch() != true)) |
| { |
| const std::string stride_b = _mapper.tensor_component_stride_batch(); |
| address += " + ("; |
| address += b + ")"; |
| address += " * "; |
| address += stride_b; |
| } |
| address += ")"; |
| return address; |
| } |
| }; |
| |
| class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter |
| { |
| public: |
| static bool |
| validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst) |
| { |
| CKW_UNUSED(x); |
| |
| if(dst->format().w != 4) |
| { |
| return false; |
| } |
| if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None) |
| { |
| return false; |
| } |
| if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None) |
| { |
| return false; |
| } |
| if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load) |
| { |
| return false; |
| } |
| if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store) |
| { |
| return false; |
| } |
| if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16)) |
| { |
| return false; |
| } |
| return true; |
| /* |
| - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4 |
| - z: Only GpuSamplerAddressModeZ::None is supported |
| */ |
| } |
| |
| ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type) |
| : IGpuLoadStoreHelperWriter(x, mapper, type) |
| { |
| } |
| |
| ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default; |
| |
| ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default; |
| |
| void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override |
| { |
| assert(validate(_writer, _mapper, _type, dst)); |
| |
| _dst = dst; |
| _ls_width_full = dst->format().w; |
| _coord_x = x->scalar(0, 0).str; |
| _coord_z = z->scalar(0, 0).str; |
| _coord_b = b->scalar(0, 0).str; |
| |
| /* |
| if(y) |
| { |
| // full load/store width |
| } |
| else |
| { |
| // no load/store |
| } |
| */ |
| } |
| |
| void write(const std::pair<int32_t, std::string> &y) override |
| { |
| int32_t idx_y = y.first; |
| std::string coord_y = y.second; |
| |
| // The only check required is on Y. |
| out_of_bound_initialize_y(coord_y); |
| |
| const std::string dst = _dst->vector(idx_y).str; |
| const std::string sampler = to_ls_image2d_sampler(); |
| const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b); |
| const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord); |
| |
| _writer->write_text(ls_buf); |
| _writer->write_text(";\n"); |
| |
| out_of_bound_finalize_y(dst); |
| } |
| |
| void finalize() override |
| { |
| } |
| |
| private: |
| IVectorTile *_dst{ nullptr }; |
| int32_t _ls_width_full{ 0 }; |
| std::string _coord_x{}; |
| std::string _coord_z{}; |
| std::string _coord_b{}; |
| |
| void out_of_bound_initialize_y(std::string &coord) |
| { |
| std::string max = ""; |
| |
| const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; |
| |
| switch(address_mode_y) |
| { |
| case TensorSamplerAddressModeY::Skip: |
| max = _mapper.tensor_component_y(); |
| _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeY::SkipMinEdgeOnly: |
| _writer->write_text("if(" + coord + " >= 0)\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeY::SkipMaxEdgeOnly: |
| max = _mapper.tensor_component_y(); |
| _writer->write_text("if(" + coord + " < " + max + ")\n"); |
| _writer->compound_statement_begin(); |
| break; |
| case TensorSamplerAddressModeY::ClampToBorder: |
| case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToNearest: |
| case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToMinEdgeOnly: |
| case TensorSamplerAddressModeY::None: |
| break; |
| default: |
| std::cout << "Unsupported address mode for write_out_of_bound_check_y" << std::endl; |
| assert(false); |
| } |
| }; |
| |
| void out_of_bound_finalize_y(const std::string &dst) |
| { |
| CKW_UNUSED(dst); |
| |
| const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; |
| |
| switch(address_mode_y) |
| { |
| case TensorSamplerAddressModeY::Skip: |
| case TensorSamplerAddressModeY::SkipMinEdgeOnly: |
| case TensorSamplerAddressModeY::SkipMaxEdgeOnly: |
| _writer->compound_statement_end(); |
| break; |
| |
| default: |
| assert(false); |
| } |
| }; |
| |
| std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string &data, |
| const std::string &sampler, const std::string &coord) |
| { |
| CKW_UNUSED(vector_width); |
| |
| auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage); |
| const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage); |
| const std::string post_fix = _dst->format().dt == DataType::Fp32 ? "f" : "h"; |
| |
| switch(type) |
| { |
| case GpuLoadStoreType::Load: |
| return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")"; |
| break; |
| case GpuLoadStoreType::Store: |
| return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")"; |
| default: |
| assert(false); |
| std::cout << "Unsupported GpuLoadStoreType" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| std::string to_ls_image2d_sampler() const |
| { |
| const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; |
| |
| switch(address_mode_y) |
| { |
| case TensorSamplerAddressModeY::None: |
| return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST"; |
| case TensorSamplerAddressModeY::Skip: |
| case TensorSamplerAddressModeY::SkipMinEdgeOnly: |
| case TensorSamplerAddressModeY::SkipMaxEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToBorder: |
| case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: |
| return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST"; |
| case TensorSamplerAddressModeY::ClampToNearest: |
| case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: |
| case TensorSamplerAddressModeY::ClampToMinEdgeOnly: |
| return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST"; |
| default: |
| std::cout << "Unsupported address_mode_coord" << std::endl; |
| assert(false); |
| return ""; |
| } |
| } |
| |
| std::string to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z, |
| const std::string &b) const |
| { |
| std::string coord_x = "(" + x + ") >> 2"; |
| std::string coord_y = "("; |
| |
| if(y != "0" && (_mapper.is_one_component_y() != true)) |
| { |
| coord_y += y; |
| } |
| if(z != "0" && (_mapper.is_one_component_z() != true)) |
| { |
| const std::string dim = _mapper.tensor_component_y(); |
| coord_y += " + ("; |
| coord_y += z + ")"; |
| coord_y += " * "; |
| coord_y += dim; |
| } |
| if(b != "0" && (_mapper.is_one_component_batch() != true)) |
| { |
| const std::string dim0 = _mapper.tensor_component_y(); |
| const std::string dim1 = _mapper.tensor_component_z(); |
| coord_y += " + ("; |
| coord_y += b + ")"; |
| coord_y += " * "; |
| coord_y += dim0; |
| coord_y += " * "; |
| coord_y += dim1; |
| } |
| coord_y += ")"; |
| return "(int2)(" + coord_x + ", " + coord_y + ")"; |
| } |
| }; |
| |
| /** IGpuLoadStoreHelperWriter factory class */ |
| class ClLoadStoreHelperWriterFactory final |
| { |
| public: |
| /** Static method to call the IGpuLoadStoreHelperWriter class accordingly with the tensor storage set in the mapper |
| * |
| * |
| * @return IGpuLoadStoreHelperWriter |
| */ |
| static std::unique_ptr<IGpuLoadStoreHelperWriter> |
| create(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type) |
| { |
| const auto tensor_storage = mapper.gpu_sampler().storage; |
| switch(tensor_storage) |
| { |
| case GpuSamplerTensorStorage::BufferUint8Ptr: |
| return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type); |
| case GpuSamplerTensorStorage::Image2dReadOnly: |
| case GpuSamplerTensorStorage::Image2dWriteOnly: |
| return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type); |
| default: |
| std::cout << "Unsupported Gpu tensor storage" << std::endl; |
| assert(false); |
| return nullptr; |
| } |
| } |
| }; |
| |
| // This utility method needs to go in utils.h |
| inline bool is_tile_scalar(const IVectorTile *x) |
| { |
| return x->format().w == 1 && x->format().h == 1; |
| } |
| |
| class ClKernelWriter : public IGpuKernelWriter |
| { |
| public: |
    // NOTE(review): both pointers are stored raw; the caller presumably retains
    // ownership and must keep them alive for the lifetime of the writer -- confirm.
    ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
    {
        _data = x;
        _attr = attr;
    }

    ClKernelWriter(const ClKernelWriter &) = default;

    ClKernelWriter &operator=(const ClKernelWriter &) = default;
| |
| // A IdSpaced ID is a term used to describe a fragment that is registered in ICode to ensure |
| // there are no conflicts or ambiguity in the code |
    // Propagate the IdSpace to both registries so that tiles and arguments
    // created afterwards are named without conflicts across code fragments.
    void set_IdSpace(int32_t id) override
    {
        _data->tiles.set_IdSpace(id);
        _data->arguments.set_IdSpace(id);
    }
| |
    // Register an externally created tile under dst_name; unlike declare_tile(),
    // no declaration code is emitted.
    void import_tile(const std::string &dst_name, const IVectorTile *src) override
    {
        _data->tiles.insert(dst_name, src);
    }
| |
    // Register a tensor as a kernel argument; whether its components are passed
    // by value is controlled by the writer attribute.
    void declare_argument(const std::string &name, const TensorInfo &tensor) override
    {
        assert(_data->arguments[name] == nullptr); // the argument must not be declared twice
        _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
    }
| |
| void declare_tile(const std::string &name, const TileInfo &format) override |
| { |
| assert(_data->tiles[name] == nullptr); |
| _data->tiles.insert(name, format); |
| |
| IVectorTile *x = _data->tiles[name]; |
| |
| for(auto &t : x->underlying_source_variables()) |
| { |
| _data->code += t.type.str + " " + t.str + ";\n"; |
| } |
| } |
| |
    // Register a tile of compile-time constant values.
    void declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in,
                            DataType dt) override
    {
        assert(_data->tiles[name] == nullptr); // the tile must not be declared twice
        _data->tiles.insert(name, in, dt);
        // Note: A constant does not need to be declared in the code
    }
| |
    // Append raw text to the generated kernel body.
    void write_text(const std::string &x) override
    {
        _data->code += x;
    }
| |
    // Open a brace-delimited scope; tiles declared inside are dropped when the
    // matching compound_statement_end() is reached.
    void compound_statement_begin() override
    {
        _data->tiles.increment_registry_level();
        _data->code += "{\n";
    }
| |
    // Close the innermost brace-delimited scope opened by compound_statement_begin().
    void compound_statement_end() override
    {
        _data->tiles.decrement_registry_level();
        _data->code += "}\n";
    }
| |
| void op_get_global_id(const Operand &dst_var, int32_t dim) override |
| { |
| assert(dst_var.type() == OperandType::Tile); |
| assert(_data->tiles.has_tile(dst_var.value())); |
| assert(_data->tiles[dst_var.value()]->format().w == 1 && _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable |
| |
| auto var = _data->tiles[dst_var.value()]; |
| |
| _data->code += var->scalar(0, 0).str; |
| _data->code += " = get_global_id("; |
| _data->code += std::to_string(dim); |
| _data->code += ");\n"; |
| }; |
| |
| void op_get_global_coord(const Operand &o_dst, const Operand &o_step, const TensorOperand &o_tensor, |
| int32_t dim) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| auto dst = operands.unpack(o_dst); |
| auto step = operands.unpack(o_step); |
| |
| // Validation: Check that x, y and z are scalar |
| |
| TensorOperandUnpacker tensor_operands(_data->arguments); |
| auto tensor = tensor_operands.unpack(o_tensor); |
| auto gpu_sampler = o_tensor.sampler(); |
| |
| GpuTensor3dMapper mapper(tensor, gpu_sampler); |
| |
| switch(dim) |
| { |
| case 0: |
| if(mapper.is_one_component_x()) |
| { |
| _data->code += dst->scalar(0, 0).str; |
| _data->code += " = 0;\n"; |
| } |
| else |
| { |
| if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) |
| { |
| // Validation: Check: fixed tensor shape |
| // TO BE CHANGED |
| _data->code += dst->scalar(0, 0).str; |
| _data->code += " = get_global_id(0) * "; |
| _data->code += step->scalar(0, 0).str; |
| _data->code += ";\n"; |
| } |
| else |
| { |
| _data->code += dst->scalar(0, 0).str; |
| _data->code += " = get_global_id(0) * "; |
| _data->code += step->scalar(0, 0).str; |
| _data->code += ";\n"; |
| } |
| } |
| break; |
| case 1: |
| if(mapper.is_one_component_y()) |
| { |
| _data->code += dst->scalar(0, 0).str; |
| _data->code += " = 0;\n"; |
| } |
| else |
| { |
| if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin) |
| { |
| } |
| else |
| { |
| _data->code += dst->scalar(0, 0).str; |
| _data->code += " = get_global_id(1) * "; |
| _data->code += step->scalar(0, 0).str; |
| _data->code += ";\n"; |
| } |
| } |
| break; |
| case 2: |
| if(mapper.is_one_component_z()) |
| { |
| _data->code += dst->scalar(0, 0).str; |
| _data->code += " = 0;\n"; |
| } |
| else |
| { |
| _data->code += dst->scalar(0, 0).str; |
| _data->code += " = get_global_id(2) * "; |
| _data->code += step->scalar(0, 0).str; |
| _data->code += ";\n"; |
| } |
| break; |
| default: |
| break; |
| } |
| }; |
| |
| void op_get_global_batch(const Operand &o_dst, const TensorOperand &o_tensor) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *dst = operands.unpack(o_dst); |
| |
| TensorOperandUnpacker tensor_operands(_data->arguments); |
| IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor); |
| auto gpu_sampler = o_tensor.sampler(); |
| |
| GpuTensor3dMapper mapper(tensor, gpu_sampler); |
| |
| if(mapper.is_one_component_batch()) |
| { |
| _data->code += dst->scalar(0, 0).str; |
| _data->code += " = 0;\n"; |
| } |
| else |
| { |
| std::cout << "Unsupported batched computation" << std::endl; |
| assert(false); |
| } |
| }; |
| |
| void op_get_global_size(const Operand &dst_var, int32_t dim) override |
| { |
| assert(dst_var.type() == OperandType::Tile); |
| assert(_data->tiles.has_tile(dst_var.value())); |
| assert(_data->tiles[dst_var.value()]->format().w == 1 && _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable |
| |
| auto var = _data->tiles[dst_var.value()]; |
| |
| _data->code += var->scalar(0, 0).str; |
| _data->code += " = get_global_size("; |
| _data->code += std::to_string(dim); |
| _data->code += ");\n"; |
| } |
| |
| void op_unary_expression(const Operand &dst_name, UnaryOp op, const Operand &src_name) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *src = operands.unpack(src_name); |
| const IVectorTile *dst = operands.unpack(dst_name); |
| |
| const int32_t dst_w = dst->format().w; |
| const int32_t dst_h = dst->format().h; |
| const int32_t src_w = src->format().w; |
| const std::string dt = dst->underlying_source_variables()[0].type.str; |
| |
| const bool broadcast_src_x = dst_w != 1 && src_w == 1; |
| |
| const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : ""; |
| |
| // Broadcasting on Y is automatic |
| for(int32_t y = 0; y < dst_h; ++y) |
| { |
| _data->code += dst->vector(y).str; |
| _data->code += " = "; |
| _data->code += to_string(op); |
| _data->code += src_prefix + src->vector(y).str; |
| _data->code += ";\n"; |
| } |
| } |
| |
| void op_binary_expression(const Operand &dst_name, const Operand &lhs_name, BinaryOp op, |
| const Operand &rhs_name) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *lhs = operands.unpack(lhs_name); |
| const IVectorTile *rhs = operands.unpack(rhs_name); |
| const IVectorTile *dst = operands.unpack(dst_name); |
| |
| const int32_t dst_w = dst->format().w; |
| const int32_t dst_h = dst->format().h; |
| assert(lhs != nullptr); |
| const int32_t lhs_w = lhs->format().w; |
| const int32_t rhs_w = rhs->format().w; |
| |
| if(op == BinaryOp::MatMul_Nt_T) |
| { |
| assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16)); |
| for(int32_t y = 0; y < dst_h; ++y) |
| { |
| for(int32_t x = 0; x < dst_w; ++x) |
| { |
| for(int32_t k = 0; k < lhs_w; ++k) |
| { |
| _data->code += dst->scalar(x, y).str; |
| _data->code += " = fma("; |
| _data->code += lhs->scalar(k, y).str; |
| _data->code += ", "; |
| _data->code += rhs->scalar(k, x).str; |
| _data->code += ", "; |
| _data->code += dst->scalar(x, y).str; |
| _data->code += ");\n"; |
| } |
| } |
| } |
| |
| return; |
| } |
| |
| const bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1; |
| const bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1; |
| |
| const std::string lhs_prefix = broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : ""; |
| const std::string rhs_prefix = broadcast_rhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : ""; |
| const std::string op_str = to_string(op); |
| |
| // Broadcasting on Y is automatic |
| for(int32_t y = 0; y < dst_h; ++y) |
| { |
| _data->code += dst->vector(y).str; |
| _data->code += " = "; |
| _data->code += lhs_prefix + lhs->vector(y).str; |
| _data->code += " "; |
| _data->code += op_str; |
| _data->code += " "; |
| _data->code += rhs_prefix + rhs->vector(y).str; |
| _data->code += ";\n"; |
| } |
| }; |
| |
| void op_cast_expression(const Operand &o_dst, const Operand &o_src, ConvertPolicy policy) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *src = operands.unpack(o_src); |
| const IVectorTile *dst = operands.unpack(o_dst); |
| // const int32_t dst_w = dst->format().w; |
| const int32_t dst_h = dst->format().h; |
| const std::string dt = dst->underlying_source_variables()[0].type.str; |
| const bool is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16); |
| const std::string sat = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : ""); |
| |
| // Broadcasting on Y is automatic |
| for(int32_t y = 0; y < dst_h; ++y) |
| { |
| _data->code += dst->vector(y).str; |
| _data->code += " = convert_" + dt + sat + "("; |
| _data->code += src->vector(y).str; |
| _data->code += ");\n"; |
| } |
| }; |
| |
| void op_assign(const Operand &dst_name, const Operand &src_name) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *src = operands.unpack(src_name); |
| const IVectorTile *dst = operands.unpack(dst_name); |
| |
| const int32_t dst_w = dst->format().w; |
| const int32_t dst_h = dst->format().h; |
| const int32_t src_w = src->format().w; |
| const std::string dt = dst->underlying_source_variables()[0].type.str; |
| |
| const bool broadcast_src_x = dst_w != 1 && src_w == 1; |
| |
| const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : ""; |
| |
| // Broadcasting on Y is automatic |
| for(int32_t y = 0; y < dst_h; ++y) |
| { |
| _data->code += dst->vector(y).str; |
| _data->code += " = "; |
| _data->code += src_prefix + src->vector(y).str; |
| _data->code += ";\n"; |
| } |
| } |
| |
| void |
| op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *src = operands.unpack(src_name); |
| const IVectorTile *dst = operands.unpack(dst_name); |
| |
| const int32_t dst_h = dst->format().h; |
| const std::string dt = dst->underlying_source_variables()[0].type.str; |
| |
| // Always perform an explicit cast. This automatically covers at least the 2 scenarios: |
| // 1. Widen a scalar into a vector type. This enables scalar-vector broadcasting |
| // 2. Ensure non-ambiguity over function overloads. |
| // E.g. a constant tile may be accidentally initialized with a double literal. By casting it to single float, |
| // it avoids ambiguous function calls |
| const std::string src_prefix = "(" + dt + ")"; |
| |
| // Broadcasting on Y is automatic |
| for(int32_t y = 0; y < dst_h; ++y) |
| { |
| _data->code += dst->vector(y).str; |
| _data->code += " = "; |
| |
| switch(func) |
| { |
| case UnaryFunction::Exp: |
| _data->code += "exp("; |
| break; |
| case UnaryFunction::Tanh: |
| _data->code += "tanh("; |
| break; |
| case UnaryFunction::Sqrt: |
| _data->code += "sqrt("; |
| break; |
| case UnaryFunction::Erf: |
| _data->code += "erf("; |
| break; |
| case UnaryFunction::Fabs: |
| _data->code += "fabs("; |
| break; |
| case UnaryFunction::Log: |
| _data->code += "log("; |
| break; |
| case UnaryFunction::SizeOf: |
| _data->code += "sizeof("; |
| break; |
| case UnaryFunction::Round: |
| _data->code += "round("; |
| break; |
| default: |
| CKW_ASSERT_MSG(false, "Unexpected UnaryFunction used."); |
| } |
| |
| _data->code += src_prefix + src->vector(y).str; |
| _data->code += ");\n"; |
| } |
| } |
| |
| void op_binary_elementwise_function(const Operand &dst_name, BinaryFunction func, const Operand &first_name, const Operand &second_name) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *first = operands.unpack(first_name); |
| const IVectorTile *second = operands.unpack(second_name); |
| const IVectorTile *dst = operands.unpack(dst_name); |
| |
| const int32_t dst_h = dst->format().h; |
| const auto datatype = dst->underlying_source_variables()[0].type; |
| const std::string datatype_str = datatype.str; |
| |
| // Always perform an explicit cast. See similar comments in op_unary_elementwise_function |
| const std::string first_prefix = "(" + datatype_str + ")"; |
| const std::string second_prefix = "(" + datatype_str + ")"; |
| |
| const bool is_float = (datatype.dt == DataType::Fp32 || datatype.dt == DataType::Fp16); |
| |
| // Broadcasting on Y is automatic |
| for(int32_t y = 0; y < dst_h; ++y) |
| { |
| _data->code += dst->vector(y).str; |
| _data->code += " = "; |
| |
| switch(func) |
| { |
| case BinaryFunction::Min: |
| _data->code += is_float ? "fmin(" : "min("; |
| break; |
| case BinaryFunction::Max: |
| _data->code += is_float ? "fmax(" : "max("; |
| break; |
| default: |
| CKW_ASSERT_MSG(false, "Unexpected BinaryFunction used."); |
| } |
| |
| _data->code += first_prefix + first->vector(y).str; |
| _data->code += ", "; |
| _data->code += second_prefix + second->vector(y).str; |
| _data->code += ");\n"; |
| } |
| } |
| |
| void op_ternary_elementwise_function(const Operand &dst_name, TernaryFunction func, const Operand &first_name, const Operand &second_name, const Operand &third_name) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *first = operands.unpack(first_name); |
| const IVectorTile *second = operands.unpack(second_name); |
| const IVectorTile *third = operands.unpack(third_name); |
| const IVectorTile *dst = operands.unpack(dst_name); |
| |
| const int32_t dst_h = dst->format().h; |
| const std::string dt = dst->underlying_source_variables()[0].type.str; |
| |
| // Always perform an explicit cast. See similar comments in op_unary_elementwise_function |
| const std::string first_prefix = "(" + dt + ")"; |
| const std::string second_prefix = "(" + dt + ")"; |
| const std::string third_prefix = "(" + dt + ")"; |
| |
| // Broadcasting on Y is automatic |
| for(int32_t y = 0; y < dst_h; ++y) |
| { |
| _data->code += dst->vector(y).str; |
| _data->code += " = "; |
| |
| switch(func) |
| { |
| case TernaryFunction::Select: |
| _data->code += "select("; |
| break; |
| default: |
| CKW_ASSERT_MSG(false, "Unexpected TernaryFunction used."); |
| } |
| |
| _data->code += first_prefix + first->vector(y).str; |
| _data->code += ", "; |
| _data->code += second_prefix + second->vector(y).str; |
| _data->code += ", "; |
| _data->code += third_prefix + third->vector(y).str; |
| _data->code += ");\n"; |
| } |
| } |
| |
| void op_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *lhs = operands.unpack(o_lhs); |
| const IVectorTile *rhs = operands.unpack(o_rhs); |
| |
| assert(is_tile_scalar(lhs)); |
| assert(is_tile_scalar(rhs)); |
| |
| _data->code += "if("; |
| _data->code += lhs->scalar(0, 0).str; |
| _data->code += " "; |
| _data->code += to_string(op); |
| _data->code += " "; |
| _data->code += rhs->scalar(0, 0).str; |
| _data->code += ")\n"; |
| } |
| |
    // Emit "else " and delegate to op_if_header, producing "else if(<lhs> <op> <rhs>)".
    void op_else_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override
    {
        _data->code += "else ";
        op_if_header(o_lhs, op, o_rhs);
    }
| |
    // Emit a bare "else" header; the body follows via compound_statement_begin/end.
    void op_else_header() override
    {
        _data->code += "else\n";
    }
| |
| void op_for_loop_header(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, const Operand &update_var_name, AssignmentOp update_op, const Operand& update_value_name) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *var = operands.unpack(var_name); |
| const IVectorTile *cond_value = operands.unpack(cond_value_name); |
| const IVectorTile *update_var = operands.unpack(update_var_name); |
| const IVectorTile *update_value = operands.unpack(update_value_name); |
| |
| const int32_t dst_w = var->format().w; |
| const int32_t dst_h = var->format().h; |
| |
| // It must be a scalar variable |
| CKW_UNUSED(dst_w, dst_h); |
| assert(dst_w == 1); |
| assert(dst_h == 1); |
| |
| _data->code += "for(; "; |
| _data->code += var->scalar(0, 0).str; |
| _data->code += " "; |
| _data->code += to_string(cond_op); |
| _data->code += " " + cond_value->scalar(0, 0).str + "; "; |
| _data->code += update_var->scalar(0, 0).str; |
| _data->code += " "; |
| _data->code += to_string(update_op); |
| _data->code += " " + update_value->scalar(0, 0).str + ")"; |
| _data->code += "\n"; |
| } |
| |
    /** Emit a tile load where each row's Y coordinate is an immediate offset from @p o_y.
     *
     * Row i loads from "y + i" (or "y + i * dilation_y" when the dilation tile's
     * text is not the literal "1"). X/Z/batch are constant across rows and are
     * folded in by the load writer's initialize() step.
     */
    void op_load_immediate(const TensorOperand &o_tensor, const Operand &o_dst, const Operand &o_x,
                           const Operand &o_y, const Operand &o_z, const Operand &o_batch_idx,
                           const Operand &dilation_y) override
    {
        OperandUnpacker operands(_data->tiles, _data->arguments);

        // Not const as it requires changes to 'load_writer'.
        IVectorTile *dst = operands.unpack(o_dst);
        IVectorTile *x = operands.unpack(o_x);
        IVectorTile *y = operands.unpack(o_y);
        IVectorTile *z = operands.unpack(o_z);
        IVectorTile *dil_y = operands.unpack(dilation_y);
        IVectorTile *b = operands.unpack(o_batch_idx);

        TensorOperandUnpacker tensor_operands(_data->arguments);
        IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
        auto gpu_sampler = o_tensor.sampler();

        GpuTensor3dMapper mapper(tensor, gpu_sampler);

        auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);

        // Initialize the constant part
        load_writer->initialize(dst, x, z, b);

        for(int i = 0; i < dst->format().h; ++i)
        {
            // Builds "y + i" and, with dilation, "y + i * dil"; C precedence means
            // only the immediate row offset i is scaled by the dilation factor.
            std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
            if(dil_y->scalar(0, 0).str != "1")
            {
                coord_y += " * " + dil_y->scalar(0, 0).str;
            }
            load_writer->write(std::make_pair(i, coord_y));
        }

        load_writer->finalize();
    }
| |
    /** Emit a tile load where each row's Y coordinate is taken from a gather tile.
     *
     * Row i loads from the i-th scalar of @p o_indirect_h (an indirect/gather
     * index tile); X/Z/batch are constant across rows and handled by the load
     * writer's initialize() step.
     */
    void op_load_indirect(const TensorOperand &o_tensor, const Operand &o_dst, const Operand &o_x,
                          const Operand &o_indirect_h, const Operand &o_z,
                          const Operand &o_batch_idx) override
    {
        OperandUnpacker operands(_data->tiles, _data->arguments);

        // Not const as it requires changes to 'load_writer'.
        IVectorTile *dst = operands.unpack(o_dst);
        IVectorTile *x = operands.unpack(o_x);
        IVectorTile *y_ind = operands.unpack(o_indirect_h);
        IVectorTile *z = operands.unpack(o_z);
        IVectorTile *b = operands.unpack(o_batch_idx);

        TensorOperandUnpacker tensor_operands(_data->arguments);
        IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
        auto gpu_sampler = o_tensor.sampler();

        GpuTensor3dMapper mapper(tensor, gpu_sampler);

        auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);

        // Initialize the constant part
        load_writer->initialize(dst, x, z, b);

        for(int i = 0; i < dst->format().h; ++i)
        {
            // Row i's Y coordinate comes from the i-th element of the indirect tile
            load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
        }

        load_writer->finalize();
    }
| |
    /** Emit a tile store where each row's Y coordinate is an immediate offset from @p y_name.
     *
     * Row m0 is stored to "y + m0". X/Z/batch are constant across rows and are
     * folded in by the store writer's initialize() step.
     */
    void op_store_immediate(const TensorOperand &tensor_name, const Operand &src_name, const Operand &x_name,
                            const Operand &y_name, const Operand &z_name,
                            const Operand &batch_index_name) override
    {
        OperandUnpacker operands(_data->tiles, _data->arguments);

        // Not const as it requires changes to 'load_writer'.
        IVectorTile *src = operands.unpack(src_name);
        IVectorTile *x = operands.unpack(x_name);
        IVectorTile *y = operands.unpack(y_name);
        IVectorTile *z = operands.unpack(z_name);
        IVectorTile *b = operands.unpack(batch_index_name);

        TensorOperandUnpacker tensor_operands(_data->arguments);
        IGpuTensorArgument *tensor = tensor_operands.unpack(tensor_name);
        auto gpu_sampler = tensor_name.sampler();

        GpuTensor3dMapper mapper(tensor, gpu_sampler);

        auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);

        // Initialize the constant part
        store_writer->initialize(src, x, z, b);

        int32_t tile_h = src->format().h;

        // Rows are emitted bottom-up. NOTE(review): presumably so the store writer
        // handles partial/out-of-bounds bottom rows first — confirm against the
        // ClLoadStoreHelperWriter implementations.
        for(int m0 = tile_h - 1; m0 >= 0; m0--)
        {
            store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
        }

        store_writer->finalize();
    }
| |
    // Emit an early "return;" statement in the generated kernel.
    void op_return() override
    {
        _data->code += "return;\n";
    }
| |
| void util_get_indirect_buffer(const Operand &o_dst, const TensorOperand &o_tensor, const Operand &o_x, |
| const Operand &o_y, const Operand &o_x_off, const Operand &o_y_off) override |
| { |
| OperandUnpacker operands(_data->tiles, _data->arguments); |
| const IVectorTile *dst = operands.unpack(o_dst); |
| const IVectorTile *x = operands.unpack(o_x); |
| const IVectorTile *y = operands.unpack(o_y); |
| const IVectorTile *x_off = operands.unpack(o_x_off); |
| const IVectorTile *y_off = operands.unpack(o_y_off); |
| |
| TensorOperandUnpacker tensor_operands(_data->arguments); |
| IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor); |
| |
| assert(dst->format().w == 1); |
| assert(x->format().w == 1); |
| assert(y->format().w == 1); |
| assert(x_off->format().w == 1); |
| assert(y_off->format().w == 1); |
| assert(dst->format().dt == DataType::Int32); |
| assert(x->format().dt == DataType::Int32); |
| assert(y->format().dt == DataType::Int32); |
| assert(x_off->format().dt == DataType::Int32); |
| assert(y_off->format().dt == DataType::Int32); |
| |
| const std::string width = tensor->component(TensorComponentType::Dim1); |
| const std::string height = tensor->component(TensorComponentType::Dim2); |
| const std::string wxh = tensor->component(TensorComponentType::Dim1xDim2); |
| /* |
| int x_s; |
| int y_s; |
| x_s = (xi_0 + x_k); |
| y_s = (yi_0 + y_k); |
| mi_0 = x_s + y_s * width + b * widthxheight; |
| mi_0 = select(-1, mi_0, x_s >= 0); |
| mi_0 = select(-1, mi_0, y_s >= 0); |
| mi_0 = select(-1, mi_0, x_s < 128); |
| mi_0 = select(-1, mi_0, y_s < 128); |
| */ |
| compound_statement_begin(); |
| declare_tile("_x_s", TileInfo(DataType::Int32)); |
| declare_tile("_y_s", TileInfo(DataType::Int32)); |
| auto x_s = operands.unpack(Operand("_x_s")); |
| auto y_s = operands.unpack(Operand("_y_s")); |
| for(int i = 0; i < dst->format().h; ++i) |
| { |
| // x_s = (xi_0 + x_k); |
| // y_s = (yi_0 + y_k); |
| _data->code += x_s->scalar(0, i).str; |
| _data->code += " = ("; |
| _data->code += x->scalar(0, i).str; |
| _data->code += " + "; |
| _data->code += x_off->scalar(0, i).str; |
| _data->code += ");\n"; |
| _data->code += y_s->scalar(0, i).str; |
| _data->code += " = ("; |
| _data->code += y->scalar(0, i).str; |
| _data->code += " + "; |
| _data->code += y_off->scalar(0, i).str; |
| _data->code += ");\n"; |
| // mi_0 = x_s + y_s * width; |
| _data->code += dst->scalar(0, i).str; |
| _data->code += " = "; |
| _data->code += x_s->scalar(0, i).str; |
| _data->code += " + "; |
| _data->code += y_s->scalar(0, i).str; |
| _data->code += " * " + width + ";\n"; |
| // mi_0 = select(wxh, mi_0, x_s >= 0); |
| _data->code += dst->scalar(0, i).str; |
| _data->code += " = select(-1, "; |
| _data->code += dst->scalar(0, i).str; |
| _data->code += ", "; |
| _data->code += x_s->scalar(0, i).str; |
| _data->code += " >= 0);\n"; |
| // mi_0 = select(wxh, mi_0, y_s >= 0); |
| _data->code += dst->scalar(0, i).str; |
| _data->code += " = select(-1, "; |
| _data->code += dst->scalar(0, i).str; |
| _data->code += ", "; |
| _data->code += y_s->scalar(0, i).str; |
| _data->code += " >= 0);\n"; |
| // mi_0 = select(wxh, mi_0, x_s < width); |
| _data->code += dst->scalar(0, i).str; |
| _data->code += " = select(-1, "; |
| _data->code += dst->scalar(0, i).str; |
| _data->code += ", "; |
| _data->code += x_s->scalar(0, i).str; |
| _data->code += " < "; |
| _data->code += width + ");\n"; |
| // mi_0 = select(wxh, mi_0, y_s < height); |
| _data->code += dst->scalar(0, i).str; |
| _data->code += " = select(-1, "; |
| _data->code += dst->scalar(0, i).str; |
| _data->code += ", "; |
| _data->code += y_s->scalar(0, i).str; |
| _data->code += " < "; |
| _data->code += height + ");\n"; |
| } |
| compound_statement_end(); |
| } |
| |
private:
    GpuKernelWriterDataHolder *_data{ nullptr }; // Generated code, tiles and arguments (not owned)
    GpuKernelWriterAttribute *_attr{ nullptr };  // Writer configuration (not owned)
| }; |
| |
| /** IGpuKernelWriter factory class */ |
| class GpuKernelWriterFactory final |
| { |
| public: |
| /** Static method to call the IGpuKernelWriter class accordingly with the Gpu programming language |
| * |
| * @param[in] gpu GPU target |
| * |
| * @return IGpuKernelWriter |
| */ |
| static std::unique_ptr<IGpuKernelWriter> |
| create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x) |
| { |
| switch(x->programming_language()) |
| { |
| case GpuTargetLanguage::OpenCL: |
| return std::make_unique<ClKernelWriter>(attr, x); |
| default: |
| std::cout << "Unsupported Gpu programming language" << std::endl; |
| assert(false); |
| return nullptr; |
| } |
| } |
| }; |
| |
| inline int32_t |
| adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx) |
| { |
| auto tensor = tensor_info_id->shape; |
| |
| int32_t dim[3] = { 0 }; |
| |
| switch(tensor_format) |
| { |
| case TensorSamplerFormat::C_W_H: |
| dim[0] = tensor[0]; |
| dim[1] = tensor[1]; |
| dim[2] = tensor[2]; |
| break; |
| case TensorSamplerFormat::C_WH_1: |
| dim[0] = tensor[0]; |
| dim[1] = tensor[1] * tensor[2]; |
| dim[2] = 1; |
| break; |
| default: |
| std::cout << "Unsupported tensor format" << std::endl; |
| assert(false); |
| break; |
| } |
| |
| return std::min(step, dim[idx]); |
| } |
| |
| } // namespace prototype |
| } // namespace ckw |
| |
| #endif // CKW_PROTOTYPE_SRC_PROTOTYPE_H |