1/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#ifndef CKW_SRC_PROTOTYPE_H
26#define CKW_SRC_PROTOTYPE_H
27
28#include <vector>
29#include <map>
30#include <string>
31#include <cstdint> // int32_t
32#include <iostream> // cout (to be removed)
33#include <cassert> // assert (to be removed)
34#include <unordered_map>
35#include <chrono>
36#include <cmath>
37#include <memory>
38#include <algorithm>
39#include <array>
40#include <stdexcept>
41
42#include "ckw/Types.h"
43#include "ckw/TensorInfo.h"
44#include "ckw/Error.h"
45
46namespace ckw
47{
48namespace prototype {
49
50// Dummy data structure for Size2D
51using Size2D = std::vector<int32_t>;
52
53// Dummy Status
54using Status = void;
55
56enum class ComponentType : int32_t
57{
58 Complex = 0,
59 Simple = 1,
60 Unfusable = 2
61};
62
63enum class GpuCompilationSpeed
64{
65 Fast = 0x00, // fast compilation may increase the latency of the network
66 Slow = 0x01 // slow compilation may decrease the latency of the network
67};
68
69enum class GpuExtensions
70{
71 Fp16,
72 Dot8,
73 Mmul,
74 FastMath
75};
76
77struct TensorInfo
78{
79 TensorShape shape { {0} };
80 DataType data_type { DataType::Unknown };
81 TensorDataLayout data_layout { TensorDataLayout::Nhwc };
82 int32_t id { -1 };
83};
84
85struct ComponentAttribute
86{
87 GpuCompilationSpeed compilation_speed {GpuCompilationSpeed::Fast};
88 bool overwrite_tile { true };
89};
90
91inline std::string data_type_to_cl_type(DataType dt)
92{
93 switch(dt)
94 {
95 case DataType::Fp32:
96 return "float";
97 case DataType::Fp16:
98 return "half";
99 case DataType::Int8:
100 return "char";
101 case DataType::Uint8:
102 return "uchar";
103 case DataType::Uint16:
104 return "ushort";
105 case DataType::Int16:
106 return "short";
107 case DataType::Uint32:
108 return "uint";
109 case DataType::Int32:
110 return "int";
111 case DataType::Bool:
112 return "bool";
113 default:
114 assert(false);
115 return "";
116 }
117}
118
119inline int32_t width_to_cl_vector_size(int32_t width)
120{
121 switch(width)
122 {
123 case 1:
124 return 1;
125 case 2:
126 return 2;
127 case 3:
128 return 3;
129 case 4:
130 return 4;
131 case 5:
132 case 6:
133 case 7:
134 case 8:
135 return 8;
136 case 9:
137 case 10:
138 case 11:
139 case 12:
140 case 13:
141 case 14:
142 case 15:
143 case 16:
144 return 16;
145 default:
146 assert(false);
147 return 0;
148 }
149}
150
151inline std::string get_cl_data_type(DataType dt, int32_t width)
152{
153 std::string data_type;
154 int32_t w = width_to_cl_vector_size(width);
155 data_type += data_type_to_cl_type(dt);
156 if(w != 1)
157 {
158 data_type += std::to_string(w);
159 }
160 return data_type;
161}
162
163inline std::string to_opencl_store(int32_t vector_length)
164{
165 if(vector_length != 1)
166 {
167 return "vstore" + std::to_string(vector_length) + "(";
168 }
169 else
170 {
171 return "*(";
172 }
173}
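
/* Example (editor's illustrative sketch, not part of the original prototype): how the helpers above
 * translate a data type and width into OpenCL C strings. The results follow directly from the switch
 * statements above; widths between 5 and 16 are rounded up to the next supported vector size.
 *
 *   get_cl_data_type(DataType::Fp32, 1);  // -> "float"
 *   get_cl_data_type(DataType::Fp16, 3);  // -> "half3"
 *   get_cl_data_type(DataType::Int32, 5); // -> "int8" (5 is rounded up to 8)
 *   to_opencl_store(4);                   // -> "vstore4("
 *   to_opencl_store(1);                   // -> "*("
 */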
174
175struct TileInfo
176{
177 TileInfo() {}
178 TileInfo(DataType dt) : dt(dt), w(1), h(1) {}
179 TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) {}
180 TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) {}
181 DataType dt{ DataType::Unknown }; // Data type of the tile
182 int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels)
183 int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions)
184};
185
186inline std::ostream& operator << (std::ostream& o, const TileInfo& a)
187{
188 o << a.w << " x " << a.h;
189 return o;
190}
191
192struct DataTypeAsString
193{
194 std::string str { "" };
195 DataType dt { DataType::Unknown };
196 int32_t size { 1 };
197};
198
199struct ValueAsString
200{
201 std::string str { "" };
202 DataTypeAsString type { };
203};
204
205// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
206// A Tile is a collection of variables used to express 2D data.
207class IScalarTile
208{
209public:
210 virtual ~IScalarTile() = default;
211 /** Method to get the scalar variable from a tile
212 * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
213 * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
214 *
215 * @return the scalar variable as a string
216 */
217 virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;
218 /** Method to get the list of underlying variable names used by the tile
219 *
220 * @return the list of variable names
221 */
222 virtual std::vector<ValueAsString> underlying_source_variables() const = 0;
223 /** Method to get the name of the tile.
224 *
225 * @return the name of the tile
226 */
227 std::string name() const
228 {
229 return _basename;
230 }
231 /** Method to get the tile format
232 *
233 * @return the format
234 */
235 TileInfo format() const
236 {
237 return _format;
238 }
239 /** Method to know whether the tile is assignable or not (constant)
240 *
241 * @return true if the tile is assignable
242 */
243 virtual bool is_assignable() const = 0;
244 /** Method to know whether the tile needs to be declared
245 *
246 * @return true if the tile needs to be declared in the code before being used
247 */
248 virtual bool need_declaration() const = 0;
249protected:
250 TileInfo _format { }; // Tile format
251 std::string _basename { "" }; // Tile name
252};
253
254// A tile is a collection of variables used to express 2D data. In the GPU context, the variables are vectors.
255// The vector size is given by the width of the tile, and the number of vectors is given by the height of the tile.
256class IVectorTile : public IScalarTile
257{
258public:
259 virtual ~IVectorTile() = default;
260 /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
261 * The user can query the list of supported width for the vectors through preferred_vector_sizes().
262 *
263 * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
264 *
265 * @return the vector variable as a string
266 */
267 virtual ValueAsString vector(int32_t y) const = 0;
268 /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
269 *
270 * @return the vector variable as a string
271 */
272 virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
273 /** Method to get the preferred vector sizes.
274 *
275 * @return a vector with the preferred vector sizes
276 */
277 //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
278};
279
280class ClTile : public IVectorTile
281{
282public:
283 ClTile(const std::string& name, TileInfo format)
284 {
285 _format = format;
286 _basename = name;
287 }
288
289 ValueAsString scalar(int32_t x, int32_t y) const override
290 {
291 x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
292 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
293
294 ValueAsString t;
295 t.str = build_variable_name(y);
296 t.type.str = get_cl_data_type(_format.dt, 1);
297 t.type.dt = _format.dt;
298 t.type.size = 1;
299
300 // Check required because if the width has only one element, we cannot use .s0
301 if(_format.w != 1)
302 {
303 // Automatic broadcasting
304 t.str += ".s" + std::to_string(x);
305 }
306
307 return t;
308 }
309
310 ValueAsString vector(int32_t y) const override
311 {
312 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
313
314 ValueAsString t;
315 t.str = build_variable_name(y);
316 t.type.str = get_cl_data_type(_format.dt, _format.w);
317 t.type.dt = _format.dt;
318 t.type.size = _format.w;
319 return t;
320 }
321
322 ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
323 {
324 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
325
326 ValueAsString t;
327 t.str = build_variable_name(y);
328 t.type.str = get_cl_data_type(_format.dt, width);
329 t.type.dt = _format.dt;
330 t.type.size = width;
331
332 if(_format.w != 1)
333 {
334 t.str += ".s";
335 for(int i = 0; i < width; ++i)
336 {
337 t.str += to_scalar_hex(x_start + i);
338 }
339 }
340 return t;
341 }
342
343 std::vector<ValueAsString> underlying_source_variables() const override
344 {
345 std::vector<ValueAsString> vars;
346 for(int32_t y = 0; y < _format.h; ++y)
347 {
348 ValueAsString t;
349 t.str = build_variable_name(y);
350 t.type.str = get_cl_data_type(_format.dt, _format.w);
351 t.type.dt = _format.dt;
352 t.type.size = _format.w;
353 vars.push_back(t);
354 }
355 return vars;
356 }
357
358 bool is_assignable() const override
359 {
360 return true;
361 }
362
363 bool need_declaration() const override
364 {
365 return true;
366 }
367
368private:
369 std::string build_variable_name(int32_t y) const
370 {
371 std::string var_name = _basename;
372
373 if(_format.h == 1)
374 {
375 return var_name;
376
377 }
378 else
379 {
380 var_name += "_";
381 var_name += std::to_string(y);
382 }
383
384 return var_name;
385 }
386
387 std::string to_scalar_hex(int32_t x) const
388 {
389 switch(x)
390 {
391 case 0:
392 case 1:
393 case 2:
394 case 3:
395 case 4:
396 case 5:
397 case 6:
398 case 7:
399 case 8:
400 case 9:
401 return std::to_string(x);
402 case 10:
403 return "A";
404 case 11:
405 return "B";
406 case 12:
407 return "C";
408 case 13:
409 return "D";
410 case 14:
411 return "E";
412 case 15:
413 return "F";
414 default:
415 std::cout << "Unsupported hexadecimal value" << std::endl;
416 assert(false);
417 return "";
418 }
419 }
420};
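
/* Example (editor's illustrative sketch, not part of the original prototype): the strings produced by a
 * 4 x 2 fp32 ClTile named "acc". Each row of the tile maps to one OpenCL vector variable.
 *
 *   ClTile acc("acc", TileInfo(DataType::Fp32, 4, 2));
 *   acc.scalar(1, 0).str;              // -> "acc_0.s1"  (type "float")
 *   acc.vector(1).str;                 // -> "acc_1"     (type "float4")
 *   acc.vector(0, 2, 1).str;           // -> "acc_1.s01" (type "float2")
 *   acc.underlying_source_variables(); // -> { "acc_0", "acc_1" }, both of type "float4"
 */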
421
422// Unique feature: it contains values in the form of strings. The name of this object is misleading, since the values can change over time.
423class ClConstantTile : public IVectorTile
424{
425public:
426 ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt)
427 {
428 _format.w = in[0].size();
429 _format.h = in.size();
430 _format.dt = dt;
431
432 _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));
433
434 for(int32_t y = 0; y < _format.h; ++y)
435 {
436 for(int32_t x = 0; x < _format.w; ++x)
437 {
438 _data[y][x] = in[y][x];
439 }
440 }
441 }
442
443 ValueAsString scalar(int32_t x, int32_t y) const override
444 {
445 x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
446 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
447
448 ValueAsString t;
449 t.str = _data[y][x];
450 t.type.str = get_cl_data_type(_format.dt, 1);
451 t.type.dt = _format.dt;
452 t.type.size = 1;
453
454 return t;
455 }
456
457 ValueAsString vector(int32_t y) const override
458 {
459 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
460
461 return vector(0, _format.w, y);
462 }
463
464 ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
465 {
466 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
467
468 ValueAsString t;
469 t.str = "";
470 t.type.str = get_cl_data_type(_format.dt, width);
471 t.type.dt = _format.dt;
472 t.type.size = width;
473
474 if(width > 1)
475 {
476 t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
477 }
478
479 int32_t x = x_start;
480 for(; x < x_start + width - 1; ++x)
481 {
482 t.str += scalar(x, y).str;
483 t.str += ", ";
484 }
485 t.str += scalar(x, y).str;
486
487 if(width > 1)
488 {
489 t.str += "))";
490 }
491
492 return t;
493 }
494
495 std::vector<ValueAsString> underlying_source_variables() const override
496 {
497 std::vector<ValueAsString> vars;
498
499 for(int32_t y = 0; y < _format.h; ++y)
500 {
501 for(int32_t x = 0; x < _format.w; ++x)
502 {
503 ValueAsString t;
504 t.str = _data[y][x];
505 t.type.str = get_cl_data_type(_format.dt, 1);
506 t.type.dt = _format.dt;
507 t.type.size = 1;
508 vars.push_back(t);
509 }
510 }
511
512 return vars;
513 }
514
515 bool is_assignable() const override
516 {
517 return false;
518 }
519
520 bool need_declaration() const override
521 {
522 return false;
523 }
524
525private:
526 std::vector<std::vector<std::string>> _data{};
527};
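
/* Example (editor's illustrative sketch, not part of the original prototype): a 2 x 2 fp32 constant tile.
 * The tile does not declare any variable; it expands directly into literal values.
 *
 *   ClConstantTile c({{"1.0f", "2.0f"}, {"3.0f", "4.0f"}}, DataType::Fp32);
 *   c.scalar(0, 1).str;   // -> "3.0f"
 *   c.vector(0).str;      // -> "((float2)(1.0f, 2.0f))"
 *   c.is_assignable();    // -> false, a constant tile cannot appear on the left-hand side
 *   c.need_declaration(); // -> false
 */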
528
529enum class TensorComponentIndex : int32_t
530{
531 IndexMask = 0x0000000f,
532};
533
534enum class TensorComponentType : int32_t
535{
536 OffsetFirstElement = 0x00000100,
537 Stride = 0x00001000,
538 Dimension = 0x00010000,
539 FoldedDimension = 0x00100000,
540 Constant = 0x01000000
541};
542
543enum class TensorComponent : int32_t
544{
545 Unknown = 0x00000000,
546 OffsetFirstElement = 0x00000100,
547 Stride1 = 0x00001001,
548 Stride2 = 0x00001002,
549 Stride3 = 0x00001003,
550 Stride4 = 0x00001004,
551 Dim0 = 0x00010000,
552 Dim1 = 0x00010001,
553 Dim2 = 0x00010002,
554 Dim3 = 0x00010003,
555 Dim4 = 0x00010004,
556 C = 0x00010000, // Dim0
557 W = 0x00010001, // Dim1
558 H = 0x00010002, // Dim2
559 D = 0x00010003,
560 N = 0x00010004,
561 Dim1xDim2 = 0x00100021,
562 Dim1xDim2xDim3 = 0x00100321,
563 WxH = 0x00100021,
564 WxHxD = 0x00100321
565};
566
567inline std::string to_string(TensorComponent x)
568{
569 switch(x)
570 {
571 case TensorComponent::Unknown:
572 return "Unknown";
573 case TensorComponent::OffsetFirstElement:
574 return "OffsetFirstElement";
575 case TensorComponent::Stride1:
576 return "Stride1";
577 case TensorComponent::Stride2:
578 return "Stride2";
579 case TensorComponent::Stride3:
580 return "Stride3";
581 case TensorComponent::Stride4:
582 return "Stride4";
583 case TensorComponent::Dim0:
584 return "Dim0";
585 case TensorComponent::Dim1:
586 return "Dim1";
587 case TensorComponent::Dim2:
588 return "Dim2";
589 case TensorComponent::Dim3:
590 return "Dim3";
591 case TensorComponent::Dim4:
592 return "Dim4";
593 case TensorComponent::Dim1xDim2:
594 return "Dim1xDim2";
595 case TensorComponent::Dim1xDim2xDim3:
596 return "Dim1xDim2xDim3";
597 default:
598 assert(false);
 return "";
599 }
600}
601
602class ITensorArgument
603{
604public:
605 virtual ~ITensorArgument() = default;
606 /** Method to get the tensor component as a string
607 *
608 * @param[in] x tensor component to query
609 *
610 * @return the tensor component as a string
611 */
612 virtual std::string component(TensorComponent x) = 0;
613 /** Method to get the tensor component type declaration as a string
614 *
615 * @return the tensor component type declaration as a string
616 */
617 virtual std::string component_type_declaration() const = 0;
618 /** Method to get the tensor component data type
619 *
620 * @return the tensor component data type
621 */
622 virtual DataType component_data_type() const = 0;
623 /** Method to get the tensor component declarations
624 *
625 * @return a vector containing the tensor component declarations
626 */
627 virtual std::vector<TensorComponent> component_declarations() const = 0;
628 /** Method to get the name of the tensor argument.
629 *
630 * @return the name of the tensor argument
631 */
632 std::string name() const
633 {
634 return _basename;
635 }
636 /** Method to get the tensor format
637 *
638 * @return the format
639 */
640 TensorInfo format() const
641 {
642 return _format;
643 }
644
645protected:
646 TensorInfo _format { };
647 std::string _basename {};
648};
649
650enum class GpuTensorStorage : int32_t
651{
652 Unknown = 0x0000,
653 BufferUint8Ptr = 0x0012,
654 Image2dReadOnly = 0x0020,
655 Image2dWriteOnly = 0x0021,
656 Image3dReadOnly = 0x0030,
657 Image3dWriteOnly = 0x0031
658};
659
660class IGpuTensorArgument : public ITensorArgument
661{
662public:
663 virtual ~IGpuTensorArgument() = default;
664 /** Method to get the tensor storage, which is the underlying storage used to keep the data memory
665 *
666 * @param[in] x tensor storage to query
667 *
668 * @return the tensor storage as a string
669 */
670 virtual std::string storage(GpuTensorStorage x) = 0;
671 /** Method to get the tensor storage type declaration as a string
672 *
673 * @param[in] x tensor component to query
674 *
675 * @return the tensor storage type declaration as a string
676 */
677 virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;
678 /** Method to get the tensor storage declarations
679 *
680 * @return a vector containing the tensor storage declarations
681 */
682 virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
683};
684
685class ClTensorArgument : public IGpuTensorArgument
686{
687public:
688 ClTensorArgument(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
689 {
690 _basename = name;
691 _format = x;
692 _return_by_value_when_possible = return_by_value_when_possible;
693 }
694
695 // Methods to override
696 std::string component(TensorComponent x) override
697 {
698 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Constant)))
699 {
700 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
701 return std::to_string(idx - 1);
702 }
703
704 if(_return_by_value_when_possible)
705 {
706 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Dimension)))
707 {
708 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
709 return std::to_string(_format.shape[idx]);
710 }
711
712 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::FoldedDimension)))
713 {
714 switch(x)
715 {
716 case TensorComponent::Dim1xDim2:
717 return std::to_string(_format.shape[1] * _format.shape[2]);
718 case TensorComponent::Dim1xDim2xDim3:
719 return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[3]);
720 default:
721 std::cout << "Unsupported folded dimension" << std::endl;
722 assert(false);
723 }
724 }
725 }
726
727 if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
728 {
729 _components_required.push_back(x);
730 }
731
732 return build_component_name(x);
733 }
734
735 std::string component_type_declaration() const override
736 {
737 return "int";
738 };
739
740 DataType component_data_type() const override
741 {
742 return DataType::Int32;
743 }
744
745 std::string storage(GpuTensorStorage x) override
746 {
747 if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
748 {
749 _storage_required.push_back(x);
750 }
751
752 return build_storage_name(x);
753 }
754
755 std::string storage_type_declaration(GpuTensorStorage x) const override
756 {
757 switch(x)
758 {
759 case GpuTensorStorage::BufferUint8Ptr:
760 return "__global uchar*";
761 case GpuTensorStorage::Image2dReadOnly:
762 return "__read_only image2d_t";
763 case GpuTensorStorage::Image2dWriteOnly:
764 return "__write_only image2d_t";
765 case GpuTensorStorage::Image3dReadOnly:
766 return "__read_only image3d_t ";
767 case GpuTensorStorage::Image3dWriteOnly:
768 return "__write_only image3d_t ";
769 default:
770 std::cout << "Unsupported storage" << std::endl;
771 assert(false);
772 return "";
773 }
774 };
775
776 std::vector<GpuTensorStorage> storage_declarations() const override
777 {
778 return _storage_required;
779 }
780
781 std::vector<TensorComponent> component_declarations() const override
782 {
783 return _components_required;
784 }
785
786private:
787 std::string build_storage_name(GpuTensorStorage x) const
788 {
789 std::string var_name = _basename;
790
791 switch(x)
792 {
793 case GpuTensorStorage::BufferUint8Ptr:
794 return var_name + "_ptr";
795 case GpuTensorStorage::Image2dReadOnly:
796 case GpuTensorStorage::Image2dWriteOnly:
797 return var_name + "_img2d";
798 case GpuTensorStorage::Image3dReadOnly:
799 case GpuTensorStorage::Image3dWriteOnly:
800 return var_name + "_img3d";
801 default:
802 std::cout << "Unsupported storage" << std::endl;
803 assert(false);
804 }
805
806 return var_name;
807 }
808
809 std::string build_component_name(TensorComponent x) const
810 {
811 std::string var_name = _basename;
812
813 switch(x)
814 {
815 case TensorComponent::OffsetFirstElement:
816 return var_name + "_offset_first_element";
817 case TensorComponent::Stride1:
818 return var_name + "_stride1";
819 case TensorComponent::Stride2:
820 return var_name + "_stride2";
821 case TensorComponent::Stride3:
822 return var_name + "_stride3";
823 case TensorComponent::Dim0:
824 return var_name + "_dim0";
825 case TensorComponent::Dim1:
826 return var_name + "_dim1";
827 case TensorComponent::Dim2:
828 return var_name + "_dim2";
829 case TensorComponent::Dim3:
830 return var_name + "_dim3";
831 case TensorComponent::Dim1xDim2:
832 return var_name + "_dim1xdim2";
833 case TensorComponent::Dim1xDim2xDim3:
834 return var_name + "_dim1xdim2xdim3";
835 default:
836 std::cout << "Unsupported component" << std::endl;
837 assert(false);
838 }
839
840 return var_name;
841 }
842
843 bool _return_by_value_when_possible { false };
844 std::vector<GpuTensorStorage> _storage_required {};
845 std::vector<TensorComponent> _components_required {};
846};
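
/* Example (editor's illustrative sketch, not part of the original prototype): how a ClTensorArgument builds
 * the kernel argument names. The tensor name "src" and the id are arbitrary.
 *
 *   TensorInfo info;
 *   info.id = 3;
 *   ClTensorArgument src("src", info, false);
 *   src.component(TensorComponent::Stride1);                        // -> "src_stride1" (also recorded as required)
 *   src.storage(GpuTensorStorage::BufferUint8Ptr);                  // -> "src_ptr"
 *   src.storage_type_declaration(GpuTensorStorage::BufferUint8Ptr); // -> "__global uchar*"
 *   src.component_declarations();                                   // -> { TensorComponent::Stride1 }
 */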
847
848/**
849 * @brief Data structure that contains the tiles declared by the components.
850 * The registry is a linear data structure that follows a principle similar to a stack. The user can call the @p increment_registry_level() method to
851 * increase the level of the stack (0 when it starts). When the user calls the @p decrement_registry_level() method, the registry decreases the level of the stack
852 * and removes (pops) all the tiles declared at the level that is being left.
853 * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code.
854 * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions
855 * when declaring tiles among different components.
856 *
857 */
858class GpuTileRegistry
859{
860public:
861enum class RegistryTileType
862{
863 Tile,
864 Link
865};
866
867using RegistryIdSpace = int32_t;
868using RegistryLevel = int32_t;
869using RegistryTileName = std::string;
870
871struct RegistryTileTableEntry
872{
873 RegistryLevel registry_level { 0 };
874 std::unique_ptr<IVectorTile> tile_object { nullptr };
875};
876
877struct RegistryTileTypeTableEntry
878{
879 RegistryTileType tile_type { RegistryTileType::Tile };
880 RegistryTileName tile_name {};
881 RegistryIdSpace registry_idspace { 0 };
882 RegistryLevel registry_level { 0 };
883};
884
885using RegistryTileTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
886using RegistryTileTypeTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>;
887 /**
888 * @brief Construct a new Gpu Tile Registry object
889 *
890 */
891 GpuTileRegistry()
892 {
893 _language = GpuTargetLanguage::Unknown;
894 }
895 /**
896 * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language
897 *
898 * @param[in] language Gpu programming language to use
899 */
900 GpuTileRegistry(GpuTargetLanguage language)
901 {
902 _language = language;
903 }
904 /**
905 * @brief Default destructor. Destroy the Gpu Tile Registry object
906 *
907 */
908 ~GpuTileRegistry() = default;
909 /**
910 * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles.
911 * Therefore, the IdSpace should be set before declaring any tiles.
912 *
913 * @param[in] id The IdSpace id
914 */
915 void set_IdSpace(int32_t id)
916 {
917 _IdSpace = id;
918 }
919 /**
920 * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles
921 *
922 * @return The IdSpace id
923 */
924 int32_t IdSpace() const
925 {
926 return _IdSpace;
927 }
928 /**
929 * @brief Gets all the IdSpace declarations defined in the tile registry.
930 *
931 * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
932 */
933 std::vector<int32_t> IdSpace_declarations() const
934 {
935 std::vector<int32_t> x;
936
937 auto it = _frags.begin();
938
939 while (it != _frags.end())
940 {
941 x.push_back(it->first);
942
943 it++;
944 }
945
946 return x;
947 }
948 /**
949 * @brief Declare a tile from a previously created tile
950 */
951 void insert(const std::string& name, const IVectorTile *frag)
952 {
953 assert(_language == GpuTargetLanguage::OpenCL);
954 const int32_t key_IdSpace = _IdSpace;
955 const std::string key_var_name = name;
956 const std::string var_name = frag->name();
957 TileInfo format = frag->format();
958
959 // First check whether a tile with the same name exists
960 IVectorTile *result = (*this)[key_var_name];
961 assert(result == nullptr);
962 if(result == nullptr)
963 {
964 std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
965
966 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
967 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
968
969 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link;
970 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
971 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
972 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
973 }
974 }
975 /**
976 * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace()
977 *
978 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
979 *
980 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
981 * @param[in] format Tile format to use
982 */
983 void insert(const std::string& name, const TileInfo& format)
984 {
985 assert(_language == GpuTargetLanguage::OpenCL);
986 const int32_t key_IdSpace = _IdSpace;
987 const std::string key_var_name = name;
988 const std::string var_name = generate_tile_name(name);
989
990 // First check whether a tile with the same name exists
991 IVectorTile *result = (*this)[key_var_name];
992 assert(result == nullptr);
993 if(result == nullptr)
994 {
995 std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
996 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
997 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
998
999 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1000 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1001 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1002 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1003 }
1004 }
1005 /**
1006 * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
1007 *
1008 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
1009 *
1010 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
1011 * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are deduced from the 2D std::vector
1012 * @param[in] dt The data type for the elements stored in the 2D std::vector as std::string. It is the user's responsibility to ensure
1013 * that the data type is aligned with the content of the std::string.
1014 */
1015 void insert(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt)
1016 {
1017 assert(_language == GpuTargetLanguage::OpenCL);
1018 const int32_t key_IdSpace = _IdSpace;
1019 const std::string key_var_name = name;
1020
1021 // First check whether a tile with the same name exists
1022 IVectorTile *result = (*this)[key_var_name];
1023 assert(result == nullptr);
1024 if(result == nullptr)
1025 {
1026 std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
1027 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
1028 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
1029
1030 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1031 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1032 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1033 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1034 }
1035 }
1036 /**
1037 * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
1038 *
1039 * @note This method can be used to declare temporary tiles that need to be accessed only once.
1040 *
1041 * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are deduced from the 2D std::vector
1042 * @param[in] dt The data type for the elements stored in the 2D std::vector as std::string. It is the user's responsibility to ensure
1043 * that the data type is aligned with the content of the std::string.
1044 *
1045 * @return IVectorTile* the anonymous constant tile
1046 */
1047 IVectorTile* insert(const std::vector<std::vector<std::string>>& in, DataType dt)
1048 {
1049 assert(_language == GpuTargetLanguage::OpenCL);
1050 const int32_t key_IdSpace = _IdSpace;
1051 const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++);
1052
1053 // First check whether a tile with the same name exists
1054 IVectorTile *result = (*this)[key_var_name];
1055 assert(result == nullptr);
1056 if(result == nullptr)
1057 {
1058 std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
1059 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
1060 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
1061
1062 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1063 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1064 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1065 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1066 }
1067
1068 return (*this)[key_var_name];
1069 }
1070 /**
1071 * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user
1072 *
1073 * @param[in] name The name of the tile to retrieve
1074 * @param[in] IdSpace The IdSpace id where to search the tile
1075 *
1076 * @return IVectorTile* The tile
1077 */
1078 IVectorTile* get(const std::string& name, int32_t IdSpace)
1079 {
1080 const int32_t key_IdSpace = IdSpace;
1081 const std::string key_var_name = name;
1082
1083 IVectorTile* result = nullptr;
1084 auto search_IdSpace = _frags.find(key_IdSpace);
1085 if(search_IdSpace != _frags.end())
1086 {
1087 auto search_tile = _frags[key_IdSpace].find(key_var_name);
1088 if(search_tile != _frags[key_IdSpace].end())
1089 {
1090 result = search_tile->second.tile_object.get();
1091 assert(result != nullptr);
1092 }
1093 }
1094
1095 return result;
1096 }
1097 /**
1098 * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace()
1099 *
1100 * @param[in] name The name of the tile to retrieve
1101 *
1102 * @return IVectorTile* The tile
1103 */
1104 IVectorTile* operator[](const std::string& name)
1105 {
1106 return get(name, _IdSpace);
1107 }
1108 /**
1109 * @brief Check whether a tile with the given name exists in the IdSpace provided by the user
1110 *
1111 * @param[in] name Name of the tile to search for
1112 * @param[in] IdSpace The IdSpace id where to search the tile
1113 *
1114 * @return true if the tile exists
1115 * @return false if the tile does not exist
1116 */
1117 bool has_tile(const std::string& name, int32_t IdSpace) const
1118 {
1119 const int32_t key_IdSpace = IdSpace;
1120 const std::string key_var_name = name;
1121
1122 auto search_IdSpace = _frags.find(key_IdSpace);
1123 const bool idspace_exists = search_IdSpace != _frags.end();
1124
1125 return idspace_exists && search_IdSpace->second.find(key_var_name) != search_IdSpace->second.end();
1126 }
1127 /**
1128 * @brief Check whether the tile within the current IdSpace exists
1129 *
1130 * @param[in] name Name of the tile to search for
1131 *
1132 * @return true if the tile exists
1133 * @return false if the tile does not exist
1134 */
1135 bool has_tile(const std::string& name) const
1136 {
1137 return has_tile(name, _IdSpace);
1138 }
1139 /**
1140 * @brief Get all the tiles declared within the IdSpace provided by the user
1141 *
1142 * @param[in] IdSpace IdSpace where to retrieve all the declared tiles
1143 *
1144 * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user
1145 */
1146 std::vector<IVectorTile*> tile_declarations(int32_t IdSpace)
1147 {
1148 std::vector<IVectorTile*> tiles;
1149
1150 std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();
1151
1152 while (it != _frag_types[IdSpace].end())
1153 {
1154 // The following check should be enabled. However, it cannot be at this stage
1155 // because this method is currently also used to retrieve the output tile produced by each component,
1156 // even though it should NOT be used for that purpose.
1157 //if(it->second.tile_type == RegistryTileType::Tile)
1158 {
1159 tiles.push_back(get(it->second.tile_name, it->second.registry_idspace));
1160 }
1161 it++;
1162 }
1163
1164 return tiles;
1165 }
1166 /**
1167 * @brief Increase the level of stack.
1168 *
1169 */
1170 void increment_registry_level()
1171 {
1172 _registry_level++;
1173 }
1174 /**
1175 * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack.
1176 *
1177 */
1178 void decrement_registry_level()
1179 {
1180 assert(_registry_level >= 0);
1181
1182 // Remove all variables in the local scope
1183 std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();
1184
1185 while (it != _frags[_IdSpace].end())
1186 {
1187 if (it->second.registry_level == _registry_level)
1188 {
1189 it = _frags[_IdSpace].erase(it);
1190 }
1191 else
1192 {
1193 it++;
1194 }
1195 }
1196
1197 std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();
1198
1199 while (it_type != _frag_types[_IdSpace].end())
1200 {
1201 if (it_type->second.registry_level == _registry_level)
1202 {
1203 it_type = _frag_types[_IdSpace].erase(it_type);
1204 }
1205 else
1206 {
1207 it_type++;
1208 }
1209 }
1210
1211 _registry_level--;
1212 }
1213 /**
1214 * @brief Get the level of the stack
1215 *
1216 */
1217 int32_t level() const
1218 {
1219 return _registry_level;
1220 }
1221
1222private:
1223 // This method ensures that the key is unique among different components
1224 std::string generate_tile_name(const std::string& name)
1225 {
1226 assert(_IdSpace >= 0 );
1227 if(_registry_level == 0)
1228 {
1229 return "_G" + std::to_string(_IdSpace) + "_" + name;
1230 }
1231 else
1232 {
1233 return name;
1234 }
1235 }
1236 RegistryTileTable _frags {};
1237 RegistryTileTypeTable _frag_types {};
1238 RegistryLevel _registry_level { 0 };
1239 RegistryIdSpace _IdSpace { -1 };
1240 int32_t _anonymous_frag_count { 0 }; // Counter used to create the anonymous tiles
1241 GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language
1242};
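
/* Example (editor's illustrative sketch, not part of the original prototype): declaring and retrieving tiles.
 * Tiles declared at registry level 0 get a "_G<IdSpace>_" prefix and are visible everywhere; tiles declared
 * after increment_registry_level() are popped again by decrement_registry_level().
 *
 *   GpuTileRegistry tiles(GpuTargetLanguage::OpenCL);
 *   tiles.set_IdSpace(0);
 *   tiles.insert("acc", TileInfo(DataType::Fp32, 4, 8)); // declared internally as "_G0_acc"
 *   IVectorTile *acc = tiles["acc"];                      // retrieved through the reference name
 *   tiles.increment_registry_level();
 *   tiles.insert("tmp", TileInfo(DataType::Fp32, 4, 1));  // local tile
 *   tiles.decrement_registry_level();                      // "tmp" is removed, "acc" survives
 */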
1243
1244using TensorEntry = std::unique_ptr<IGpuTensorArgument>;
1245
1246/**
1247 * @brief Data structure that contains the tensors consumed by the components.
1248 * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions
1249 * when declaring tensors among different components.
1250 *
1251 */
1252class GpuTensorArgumentRegistry
1253{
1254public:
1255 /**
1256 * @brief Construct a new Gpu Tensor Registry object
1257 *
1258 */
1259 GpuTensorArgumentRegistry()
1260 {
1261 _language = GpuTargetLanguage::Unknown;
1262 }
1263 /**
1264 * @brief Construct a new Gpu Tensor Registry object
1265 *
1266 * @param[in] language Gpu programming language to use
1267 */
1268 GpuTensorArgumentRegistry(GpuTargetLanguage language)
1269 {
1270 _language = language;
1271 }
1272 /**
1273 * @brief Default destructor. Destroy the Gpu Tensor Registry object
1274 *
1275 */
1276 ~GpuTensorArgumentRegistry() = default;
1277 /**
1278 * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors.
1279 * Therefore, the IdSpace should be set before declaring any tensors.
1280 *
1281 * @param[in] id The IdSpace id
1282 */
1283 void set_IdSpace(int32_t id)
1284 {
1285 _IdSpace = id;
1286 }
1287 /**
1288 * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors
1289 *
1290 * @return The IdSpace id
1291 */
1292 int32_t IdSpace() const
1293 {
1294 return _IdSpace;
1295 }
1296 /**
1297 * @brief Gets all the IdSpace declarations defined in the tensor registry.
1298 *
1299 * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
1300 */
1301 std::vector<int32_t> IdSpace_declarations() const
1302 {
1303 std::vector<int32_t> x;
1304
1305 auto it = _refs.begin();
1306
1307 while (it != _refs.end())
1308 {
1309 x.push_back(it->first);
1310
1311 it++;
1312 }
1313
1314 return x;
1315 }
1316 /**
1317 * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace()
1318 *
1319 * @note The reference name used for declaring the tensor should not be previously used in the IdSpace
1320 *
1321 * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry.
1322 * @param[in] x Tensor info, which also carries the tensor id
1323 * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components
1324 */
1325 void insert(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
1326 {
1327 assert(_language == GpuTargetLanguage::OpenCL);
1328 const int32_t key_IdSpace = _IdSpace;
1329 const int32_t tensor_id = x.id;
1330 const std::string key_var_name = name;
1331 const std::string var_name = generate_tensor_name(name, tensor_id);
1332
1333 // First, check whether the tensor has already a reference. If so, trigger an assert
1334 assert(!has_tensor_argument(name));
1335
1336 // Check whether a tensor with that tensorID exists
1337 auto result = _tensor_arguments.find(tensor_id);
1338 if(result == _tensor_arguments.end())
1339 {
1340 // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference
1341 std::unique_ptr<ClTensorArgument> arg = std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
1342 _tensor_arguments[tensor_id] = std::move(arg);
1343 }
1344
1345 _refs[key_IdSpace][key_var_name] = tensor_id;
1346 }
1347 /**
1348 * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace()
1349 *
1350 * @param[in] name The name of the tensor to retrieve
1351 *
1352 * @return IGpuTensor* The tensor
1353 */
1354 IGpuTensorArgument* operator[](const std::string& name)
1355 {
1356 const int32_t key_IdSpace = _IdSpace;
1357 const std::string key_var_name = name;
1358
1359 IGpuTensorArgument* result = nullptr;
1360 auto search_IdSpace = _refs.find(key_IdSpace);
1361 if(search_IdSpace != _refs.end())
1362 {
1363 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1364
1365 if(search_tensor_id != _refs[key_IdSpace].end())
1366 {
1367 const int32_t tensor_id = search_tensor_id->second;
1368 auto search_tensor_argument = _tensor_arguments.find(tensor_id);
1369 if(search_tensor_argument != _tensor_arguments.end())
1370 {
1371 result = search_tensor_argument->second.get();
1372 }
1373 assert(result != nullptr);
1374 }
1375 }
1376
1377 return result;
1378 }
1379 /**
1380 * @brief Get all the tensors declared in the IdSpace provided by the user
1381 *
1382 * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors
1383 */
1384 std::vector<IGpuTensorArgument*> tensor_argument_declarations()
1385 {
1386 std::vector<IGpuTensorArgument*> args;
1387
1388 auto it = _tensor_arguments.begin();
1389
1390 while (it != _tensor_arguments.end())
1391 {
1392 args.push_back(it->second.get());
1393 it++;
1394 }
1395
1396 return args;
1397 }
1398 /**
1399 * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists
1400 *
1401 * @param[in] name Name of the tensor argument to search for
1402 *
1403 * @return true if the tensor argument exists
1404 * @return false if the tensor argument does not exist
1405 */
1406 bool has_tensor_argument(const std::string& name)
1407 {
1408 const int32_t key_IdSpace = _IdSpace;
1409 const std::string key_var_name = name;
1410
1411 auto search_IdSpace = _refs.find(key_IdSpace);
1412
1413 if(search_IdSpace != _refs.end())
1414 {
1415 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1416
1417 return search_tensor_id != _refs[key_IdSpace].end();
1418 }
1419 else
1420 {
1421 return false;
1422 }
1423 }
1424 /**
1425 * @brief Check whether the tensor argument exists in the IdSpace provided by the user
1426 *
1427 * @param[in] name Name of the tensor argument to search for
1428 * @param[in] IdSpace The IdSpace id where to search the tensor argument
1429 *
1430 * @return true if the tensor argument exists
1431 * @return false if the tensor argument does not exist
1432 */
1433 bool has_tensor_argument(const std::string& name, int32_t IdSpace)
1434 {
1435 const int32_t key_IdSpace = IdSpace;
1436 const std::string key_var_name = name;
1437
1438 auto search_IdSpace = _refs.find(key_IdSpace);
1439
1440 if(search_IdSpace != _refs.end())
1441 {
1442 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1443
1444 return search_tensor_id != _refs[key_IdSpace].end();
1445 }
1446 else
1447 {
1448 return false;
1449 }
1450 }
1451private:
1452 // This method ensures that the key is unique among different components
1453 std::string generate_tensor_name(const std::string& name, int32_t tensor_id)
1454 {
1455 assert(tensor_id >= 0 );
1456
1457 return name + std::to_string(tensor_id);
1458 }
1459
1460 std::map<int32_t, TensorEntry> _tensor_arguments {};
1461 std::map<int32_t, std::map<std::string, int32_t>> _refs {};
1462 int32_t _IdSpace { -1 };
1463 GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language
1464};
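
/* Example (editor's illustrative sketch, not part of the original prototype): registering a tensor argument
 * and retrieving it through its reference name. The unique variable name is derived from the tensor id.
 *
 *   GpuTensorArgumentRegistry args(GpuTargetLanguage::OpenCL);
 *   args.set_IdSpace(0);
 *   TensorInfo src_info;
 *   src_info.id = 2;
 *   args.insert("src", src_info, true); // internally named "src2"
 *   IGpuTensorArgument *src = args["src"];
 *   args.has_tensor_argument("src");    // -> true
 */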
1465
1466enum class OpType : int32_t
1467{
1468 Elementwise = 0x0000,
1469 Relational = 0x1000,
1470 Algebra = 0x2000
1471};
1472
1473inline std::string to_string(AssignmentOp op)
1474{
1475 switch(op)
1476 {
1477 case AssignmentOp::Decrement:
1478 return "-=";
1479 case AssignmentOp::Increment:
1480 return "+=";
1481 default:
1482 assert(false);
1483 return "";
1484 }
1485}
1486
1487inline std::string to_string(BinaryOp op)
1488{
1489 switch(op)
1490 {
1491 case BinaryOp::Add:
1492 return "+";
1493 case BinaryOp::Sub:
1494 return "-";
1495 case BinaryOp::Mul:
1496 return "*";
1497 case BinaryOp::Div:
1498 return "/";
1499 case BinaryOp::Mod:
1500 return "%";
1501 case BinaryOp::Equal:
1502 return "==";
1503 case BinaryOp::Less:
1504 return "<";
1505 case BinaryOp::LessEqual:
1506 return "<=";
1507 case BinaryOp::Greater:
1508 return ">";
1509 case BinaryOp::GreaterEqual:
1510 return ">=";
1511 case BinaryOp::LogicalAnd:
1512 return "&&";
1513 case BinaryOp::LogicalOr:
1514 return "||";
1515 case BinaryOp::LogicalNot:
1516 return "!";
1517 default:
1518 assert(false);
1519 return "";
1520 }
1521}
1522
1523inline std::string binary_op_string(BinaryOp op)
1524{
1525 switch(op)
1526 {
1527 case BinaryOp::Add:
1528 return "add";
1529 case BinaryOp::Sub:
1530 return "sub";
1531 case BinaryOp::Mul:
1532 return "mul";
1533 case BinaryOp::Div:
1534 return "div";
1535 case BinaryOp::Mod:
1536 return "mod";
1537 case BinaryOp::Equal:
1538 return "eq";
1539 case BinaryOp::Less:
1540 return "gt";
1541 case BinaryOp::LessEqual:
1542 return "gteq";
1543 case BinaryOp::Greater:
1544 return "lt";
1545 case BinaryOp::GreaterEqual:
1546 return "lte";
1547 default:
1548 assert(false);
1549 return "";
1550 }
1551}
1552
1553enum class OperandType : int32_t
1554{
1555 Unknown = 0x00000000,
1556 ScalarFp32 = 0x00001011, // Immediate scalar tile
1557 ScalarFp16 = 0x00001012, // Immediate scalar tile
1558 ScalarInt32 = 0x00001021, // Immediate scalar tile
1559 ScalarInt16 = 0x00001022, // Immediate scalar tile
1560 ScalarInt8 = 0x00001024, // Immediate scalar tile
1561 ScalarUInt32 = 0x00001031, // Immediate scalar tile
1562 ScalarUInt16 = 0x00001032, // Immediate scalar tile
1563 ScalarUInt8 = 0x00001034, // Immediate scalar tile
1564 ScalarBool = 0x00001041, // Immediate scalar tile
1565 ScalarTile = 0x00001050, // Scalar from a tile
1566 Tile = 0x00010000, // Tile
1567 TensorStride1 = 0x00100001, // Tensor component
1568 TensorStride2 = 0x00100002, // Tensor component
1569 TensorStride3 = 0x00100003, // Tensor component
1570 TensorStride4 = 0x00100004, // Tensor component
1571 TensorDim0 = 0x00100010, // Tensor component
1572 TensorDim1 = 0x00100020, // Tensor component
1573 TensorDim2 = 0x00100030, // Tensor component
1574 TensorDim3 = 0x00100040, // Tensor component
1575 TensorDim4 = 0x00100050, // Tensor component
1576 TensorC = 0x00100010, // Tensor component
1577 TensorW = 0x00100020, // Tensor component
1578 TensorH = 0x00100030, // Tensor component
1579 TensorD = 0x00100040, // Tensor component
1580 TensorN = 0x00100050, // Tensor component
1581 TensorDim1xDim2 = 0x00100100, // Tensor component
1582 TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
1583 TensorWxH = 0x00100300, // Tensor component
1584 TensorWxHxD = 0x00100400, // Tensor component
1585 TensorDataOffset = 0x00100500, // Tensor component
1586};
1587
1588struct ScalarTileCoord
1589{
1590 ScalarTileCoord() {}
1591 ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) {}
1592 int32_t x { -1 };
1593 int32_t y { -1 };
1594};
1595/**
1596 * @brief Operand class. This object is used to pass the operands to the operations performed by the writer.
1597 * Operand can be of three types:
1598 * -# Scalar immediate: constant expression
1599 * -# Tile: A tile
1600 * -# Tensor component: A component (scalar) of a tensor
1601 *
1602 */
1603class Operand
1604{
1605public:
1606 Operand(const std::string &val)
1607 {
1608 _str = val;
1609 _type = OperandType::Tile;
1610 }
1611
1612 Operand(const std::string &val, const ScalarTileCoord& coord)
1613 {
1614 _str = val;
1615 _type = OperandType::ScalarTile;
1616 _coord = coord;
1617 }
1618
1619 Operand(const std::string &val, OperandType type)
1620 {
1621 _str = val;
1622 _type = type;
1623 }
1624
1625 Operand(const Operand& t)
1626 {
1627 _str = t.value();
1628 _type = t.type();
1629 }
1630
1631 Operand& operator=(const Operand& t)
1632 {
1633 _str = t.value();
1634 _type = t.type();
1635 _coord = t.scalar_tile_coordinate();
1636 return *this;
1637 }
1638
1639 std::string value() const
1640 {
1641 return _str;
1642 }
1643
1644 OperandType type() const
1645 {
1646 return _type;
1647 }
1648
1649 ScalarTileCoord scalar_tile_coordinate() const
1650 {
1651 return _coord;
1652 }
1653
1654private:
1655 std::string _str {};
1656 OperandType _type { OperandType::Unknown };
1657 ScalarTileCoord _coord {};
1658};
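
/* Example (editor's illustrative sketch, not part of the original prototype): the kinds of operand that can
 * be passed to the writer. The names "acc" and "src" are arbitrary tile/tensor reference names.
 *
 *   Operand tile_op("acc");                               // a tile
 *   Operand imm_op("4", OperandType::ScalarInt32);        // a scalar immediate
 *   Operand scalar_of_tile("acc", ScalarTileCoord(0, 0)); // a single scalar taken from a tile
 *   Operand tensor_dim("src", OperandType::TensorDim1);   // a tensor component (dimension 1 of "src")
 */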
1659
1660enum class GpuSamplerTensorStorage : int32_t
1661{
1662 Unknown = static_cast<int32_t>(GpuTensorStorage::Unknown),
1663 BufferUint8Ptr = static_cast<int32_t>(GpuTensorStorage::BufferUint8Ptr),
1664 Image2dReadOnly = static_cast<int32_t>(GpuTensorStorage::Image2dReadOnly),
1665 Image2dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image2dWriteOnly),
1666 Image3dReadOnly = static_cast<int32_t>(GpuTensorStorage::Image3dReadOnly),
1667 Image3dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image3dWriteOnly),
1668};
1669
1670struct GpuSampler
1671{
1672 GpuSampler() = default;
1673 TensorSamplerFormat format { TensorSamplerFormat::Unknown };
1674 GpuSamplerTensorStorage storage { GpuSamplerTensorStorage::Unknown };
1675 TensorSamplerAddressModeX address_mode_x { TensorSamplerAddressModeX::Unknown };
1676 TensorSamplerAddressModeY address_mode_y { TensorSamplerAddressModeY::Unknown };
1677 TensorSamplerAddressModeZ address_mode_z { TensorSamplerAddressModeZ::Unknown };
1678};
1679
1680inline GpuSampler create_simple_sampler(const TensorInfo* tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
1681{
1682 CKW_UNUSED(step_x, step_y, step_z);
1683
1684 auto tensor = tensor_info_id->shape;
1685
1686 GpuSampler dst_sampler;
1687 dst_sampler.format = sampler.format;
1688 dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr;
1689 dst_sampler.address_mode_x = sampler.address_mode_x;
1690 dst_sampler.address_mode_y = sampler.address_mode_y;
1691 dst_sampler.address_mode_z = sampler.address_mode_z;
1692
1693 int32_t dim_x = 0;
1694 int32_t dim_y = 0;
1695 int32_t dim_z = 0;
1696
1697 switch(sampler.format)
1698 {
1699 case TensorSamplerFormat::C_W_H:
1700 dim_x = tensor[0];
1701 dim_y = tensor[1];
1702 dim_z = tensor[2];
1703 break;
1704 case TensorSamplerFormat::C_WH_1:
1705 dim_x = tensor[0];
1706 dim_y = tensor[1] * tensor[2];
1707 dim_z = 1;
1708 break;
1709 default:
1710 std::cout << "Unsupported tensor format" << std::endl;
1711 assert(false);
1712 break;
1713 }
1714
1715 if(dim_x == 1)
1716 {
1717 assert(step_x == 1);
1718 dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
1719 }
1720
1721 if(dim_y == 1)
1722 {
1723 assert(step_y == 1);
1724 dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
1725 }
1726
1727 if(dim_z == 1)
1728 {
1729 assert(step_z == 1);
1730 dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
1731 }
1732
1733 return dst_sampler;
1734}
1735
1736class GpuOutputSampler
1737{
1738public:
1739 GpuOutputSampler() = default;
1740 /**
1741 * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once
1742 * by the root component. Once initialized, all simple components will need to use this sampler
1743 * or a broadcasted version of it
1744 *
1745 * @param[in] tensor_info_id Tensor info of the tensor to write
1746 * @param[in] tensor_storage Tensor storage to use
1747 * @param[in] tensor_format Tensor format to use
1748 * @param[in] step_x Increment step in the X direction. It is not necessarily the same as the n0 of the tile!
1749 * @param[in] step_y Increment step in the Y direction. It is not necessarily the same as the m0 of the tile!
1750 * @param[in] step_z Increment step in the Z direction. It is not necessarily the same as the d0 of the tile!
1749 */
1750 void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z)
1751 {
1752 assert(_is_initialized == false);
1753
1754 _step_x = step_x;
1755 _step_y = step_y;
1756 _step_z = step_z;
1757 _tensor_info_id = tensor_info_id;
1758 _sampler = create_sampler(tensor_storage, tensor_format);
1759 _is_initialized = true;
1760 };
1761
1762 GpuSampler sampler() const
1763 {
1764 return _sampler;
1765 };
1766
1767 int32_t step_x() const
1768 {
1769 return _step_x;
1770 };
1771
1772 int32_t step_y() const
1773 {
1774 return _step_y;
1775 };
1776
1777 int32_t step_z() const
1778 {
1779 return _step_z;
1780 };
1781private:
1782 GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format)
1783 {
1784 // Output can only be in output mode
1785 assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly);
1786 assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly);
1787
1788 auto tensor = _tensor_info_id->shape;
1789
1790 GpuSampler sampler;
1791 sampler.format = tensor_format;
1792 sampler.storage = tensor_storage;
1793 sampler.address_mode_x = TensorSamplerAddressModeX::None;
1794 sampler.address_mode_y = TensorSamplerAddressModeY::None;
1795 sampler.address_mode_z = TensorSamplerAddressModeZ::None;
1796
1797 // In the case of texture, we do not need any special checks at the border
1798 if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
1799 {
1800 int32_t dim_x = 0;
1801 int32_t dim_y = 0;
1802 int32_t dim_z = 0;
1803
1804 switch(tensor_format)
1805 {
1806 case TensorSamplerFormat::C_W_H:
1807 dim_x = tensor[0];
1808 dim_y = tensor[1];
1809 dim_z = tensor[2];
1810 break;
1811 case TensorSamplerFormat::C_WH_1:
1812 dim_x = tensor[0];
1813 dim_y = tensor[1] * tensor[2];
1814 dim_z = 1;
1815 break;
1816 default:
1817 std::cout << "Unsupported tensor format" << std::endl;
1818 assert(false);
1819 break;
1820 }
1821
1822 if((dim_x % _step_x) != 0 && dim_x != 1)
1823 {
1824 sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
1825 }
1826
1827 if((dim_y % _step_y) != 0 && dim_y != 1)
1828 {
1829 sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
1830 }
1831
1832 if((dim_z % _step_z) != 0 && dim_z != 1)
1833 {
1834 sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
1835 }
1836 }
1837
1838 return sampler;
1839 }
1840 GpuSampler _sampler { }; // GpuSampler
1841 int32_t _step_x { 1 };
1842 int32_t _step_y { 1 };
1843 int32_t _step_z { 1 };
1844 const TensorInfo* _tensor_info_id { nullptr };
1845 bool _is_initialized { false };
1846};
1847
1848/**
1849 * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer.
1850 */
1851class TensorOperand
1852{
1853public:
1854 TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
1855 {
1856 }
1857
1858 TensorOperand& operator=(const TensorOperand& t)
1859 {
1860 _str = t.value();
1861 _sampler = t.sampler();
1862 return *this;
1863 }
1864
1865 std::string value() const
1866 {
1867 return _str;
1868 }
1869
1870 GpuSampler sampler() const
1871 {
1872 return _sampler;
1873 }
1874
1875private:
1876 std::string _str {};
1877 GpuSampler _sampler {};
1878};
1879
1880/**
1881 * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer
1882 * This data structure must be initialized before being passed to the Gpu Kernel Writer
1883 *
1884 */
1885class GpuKernelWriterDataHolder
1886{
1887public:
1888 /**
1889 * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
1890 * the GPU target and target specific capabilities (extensions). For now, we just initialize the
1891 * programming language
1892 *
1893 * @param[in] language Gpu programming language to use
1894 */
1895 GpuKernelWriterDataHolder(GpuTargetLanguage language) : tiles(language), arguments(language), code(""), _language(language)
1896 {
1897 }
1898 /**
1899 * @brief Get the Gpu programming language used
1900 *
1901 * @return GpuTargetLanguage the Gpu programming language
1902 */
1903 GpuTargetLanguage programming_language() const
1904 {
1905 return _language;
1906 }
1907 /**
1908 * @brief @ref GpuTileRegistry
1909 *
1910 */
1911 GpuTileRegistry tiles{};
1912 /**
1913 * @brief @ref GpuTensorArgumentRegistry
1914 *
1915 */
1916 GpuTensorArgumentRegistry arguments{};
1917 /**
1918 * @brief @ref GpuOutputSampler.
1919 *
1920 */
1921 GpuOutputSampler output_sampler{};
1922 /**
1923 * @brief Source code
1924 *
1925 */
1926 std::string code{};
1927
1928 // GpuExtensionRegistry extensions{};
1929private:
1930 GpuTargetLanguage _language;
1931};
1932
1933struct LWS
1934{
1935 int32_t x {1};
1936 int32_t y {1};
1937 int32_t z {1};
1938};
1939
1940/**
 * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker
 * declares an anonymous tile in the tile registry.
1943 */
1944class OperandUnpacker
1945{
1946public:
1947 OperandUnpacker(GpuTileRegistry& tiles, GpuTensorArgumentRegistry& arguments) : _tiles(tiles), _arguments(arguments)
1948 {
1949 // Increase the level of the stack to allocate possible temporary tiles
1950 _tiles.increment_registry_level();
1951 };
1952
1953 ~OperandUnpacker()
1954 {
1955 // Decrease the level of the stack to deallocate any temporary tiles
1956 _tiles.decrement_registry_level();
1957 }
1958
1959 IVectorTile* unpack(const Operand& src)
1960 {
1961 // Get the tile
1962 if(src.type() == OperandType::Tile)
1963 {
1964 assert(_tiles.has_tile(src.value()));
1965 return _tiles[src.value()];
1966 }
1967 // Create an anonymous tile with a constant
1968 else if(static_cast<int32_t>(src.type()) & 0x00001000)
1969 {
1970 if(src.type() == OperandType::ScalarTile)
1971 {
1972 ScalarTileCoord coord = src.scalar_tile_coordinate();
1973 assert(_tiles.has_tile(src.value()));
1974 assert(coord.x >= 0);
1975 assert(coord.y >= 0);
1976 auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
1977 return _tiles.insert({{{val.str}}}, val.type.dt);
1978 }
1979 else
1980 {
1981 return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
1982 }
1983 }
1984 // Create an anonymous tile with the tensor component
1985 else
1986 {
1987 assert(_arguments.has_tensor_argument(src.value()));
1988 auto x = _arguments[src.value()];
1989 const std::string val = x->component(to_tensor_component(src.type()));
1990 const DataType dt = x->component_data_type();
1991 return _tiles.insert({{{val}}}, dt);
1992 }
1993 }
1994
1995private:
1996 DataType to_tile_data_type(OperandType x)
1997 {
1998 return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
1999 }
2000
2001 TensorComponent to_tensor_component(OperandType x)
2002 {
2003 switch(x)
2004 {
2005 case OperandType::TensorDim0:
2006 return TensorComponent::Dim0;
2007 case OperandType::TensorDim1:
2008 return TensorComponent::Dim1;
2009 case OperandType::TensorDim2:
2010 return TensorComponent::Dim2;
2011 case OperandType::TensorDim3:
2012 return TensorComponent::Dim3;
2013 case OperandType::TensorDim4:
2014 return TensorComponent::Dim4;
2015 case OperandType::TensorStride1:
2016 return TensorComponent::Stride1;
2017 case OperandType::TensorStride2:
2018 return TensorComponent::Stride2;
2019 case OperandType::TensorStride3:
2020 return TensorComponent::Stride3;
2021 case OperandType::TensorStride4:
2022 return TensorComponent::Stride4;
2023 case OperandType::TensorDim1xDim2:
2024 return TensorComponent::Dim1xDim2;
2025 case OperandType::TensorDim1xDim2xDim3:
2026 return TensorComponent::Dim1xDim2xDim3;
2027 case OperandType::TensorDataOffset:
2028 return TensorComponent::OffsetFirstElement;
2029 default:
2030 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002031 return TensorComponent::Unknown;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002032 }
2033 }
2034
2035 GpuTileRegistry& _tiles;
2036 GpuTensorArgumentRegistry& _arguments;
2037};
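// Illustrative sketch (not part of this header) of how OperandUnpacker is typically used while writing an
// operation, where "data" is a GpuKernelWriterDataHolder. The tile name "dst" and the tensor argument name
// "src" are hypothetical and must already be registered in the respective registries:
//
//   OperandUnpacker operands(data.tiles, data.arguments);
//   IVectorTile *dst = operands.unpack(Operand("dst"));                          // existing tile
//   IVectorTile *imm = operands.unpack(Operand("4", OperandType::ScalarInt32));  // anonymous constant tile
//   IVectorTile *dim = operands.unpack(Operand("src", OperandType::TensorDim0)); // tensor component as a tile
//
// The anonymous tiles are allocated in a new registry level which is released when the unpacker goes out of scope.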
2038
2039/**
 * @brief Utility class used to get the tensor argument from a tensor operand. Unlike @ref OperandUnpacker,
 * this class does not create anonymous tiles: the tensor argument must already be registered in the
 * tensor argument registry.
2043 */
2044class TensorOperandUnpacker
2045{
2046public:
2047 TensorOperandUnpacker(GpuTensorArgumentRegistry& arguments) : _arguments(arguments)
2048 {
2049 };
2050
2051 IGpuTensorArgument* unpack(const TensorOperand& src)
2052 {
2053 assert(_arguments.has_tensor_argument(src.value()));
2054 return _arguments[src.value()];
2055 }
2056
2057private:
2058 GpuTensorArgumentRegistry& _arguments;
2059};
2060
2061/**
 * @brief The GpuKernel will be used in three stages:
 * -# Compilation stage
 * -# Tuning stage
 * -# Dispatch stage
2066 */
2067struct GpuKernel
2068{
2069 // Compilation stage
2070 std::string code {}; // Source code, required for the compilation stage
2071 std::vector<GpuExtensions> list_extensions{}; // Extensions, required for the compilation stage
2072 // Tuning stage
2073 std::string config_id {}; // Unique id, required for the tuning stage
2074 std::vector<LWS> list_lws{}; // LWS to test, required for the tuning stage
2075 // Dispatch stage
2076 GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
2077 std::vector<std::pair<int32_t, GpuTensorStorage>> list_tensor_storages; // List of tensor storages, required for the dispatch stage
2078 std::vector<std::pair<int32_t, TensorComponent>> list_tensor_components;// List of tensor components (width, stride,..), required for the dispatch stage)
2079};
2080
// Generates the full kernel source code (signature and body) for the given kernel name from the data holder
2082inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string& name)
2083{
2084 std::string code;
2085 code += "__kernel void ";
2086 code += name;
2087 code += "(\n";
2088
2089 auto IdSpaces = in.arguments.IdSpace_declarations();
2090
2091 std::vector<std::string> arg_str;
2092
2093 auto tensor_args = in.arguments.tensor_argument_declarations();
2094
2095 for(auto &i : tensor_args)
2096 {
2097 // For each tensor used, get the storage and tensor components
2098 auto storages = i->storage_declarations();
2099 auto components = i->component_declarations();
2100
2101 for(auto &y : storages)
2102 {
2103 std::string str;
2104 str += i->storage_type_declaration(y);
2105 str += " ";
2106 str += i->storage(y);
2107 arg_str.push_back(str);
2108 }
2109
2110 for(auto &y : components)
2111 {
2112 std::string str;
2113 str += i->component_type_declaration();
2114 str += " ";
2115 str += i->component(y);
2116 arg_str.push_back(str);
2117 }
2118 }
2119
2120 for(size_t i = 0; i < arg_str.size(); ++i)
2121 {
2122 code += arg_str[i];
2123 if(i + 1 < arg_str.size())
2124 {
2125 code += ",\n";
2126 }
2127 }
2128
2129 code += ")\n";
2130 code += "{\n";
2131 code += in.code;
2132 code += "}\n";
2133
2134 return code;
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002135}
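// For reference, generate_code() produces a kernel skeleton of the following shape. The argument list depends
// on which storages and components were marked as used; the names below are purely illustrative:
//
//   __kernel void my_kernel(
//   __global uchar *src_ptr,
//   int src_stride_y,
//   int src_dim1xdim2
//   )
//   {
//   <body accumulated in GpuKernelWriterDataHolder::code>
//   }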
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002136
2137/**
 * @brief This class is responsible for mapping an N-dimensional tensor to a 3d tensor. The mapper needs the GpuSampler
 * to know how to reduce the dimensionality of the tensor
2140 *
2141 */
2142class GpuTensor3dMapper
2143{
2144public:
2145 GpuTensor3dMapper(IGpuTensorArgument* tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor)
2146 {
2147 };
2148
2149 std::string tensor_component_x() const
2150 {
2151 const auto format = _sampler.format;
2152 switch(format)
2153 {
2154 case TensorSamplerFormat::C_WH_1:
2155 case TensorSamplerFormat::C_W_H:
2156 return _tensor->component(TensorComponent::C);
2157 default:
2158 std::cout << "Unsupported tensor format" << std::endl;
2159 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002160 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002161 }
2162 }
2163
2164 std::string tensor_component_y() const
2165 {
2166 const auto format = _sampler.format;
2167 switch(format)
2168 {
2169 case TensorSamplerFormat::C_WH_1:
2170 return _tensor->component(TensorComponent::WxH);
2171 case TensorSamplerFormat::C_W_H:
2172 return _tensor->component(TensorComponent::W);
2173 default:
2174 std::cout << "Unsupported tensor format" << std::endl;
2175 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002176 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002177 }
2178 }
2179
2180 std::string tensor_component_z() const
2181 {
2182 const auto format = _sampler.format;
2183 switch(format)
2184 {
2185 case TensorSamplerFormat::C_WH_1:
2186 return "1";
2187 case TensorSamplerFormat::C_W_H:
2188 return _tensor->component(TensorComponent::H);
2189 default:
2190 std::cout << "Unsupported tensor format" << std::endl;
2191 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002192 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002193 }
2194 }
2195
2196 std::string tensor_component_stride_y() const
2197 {
2198 const auto format = _sampler.format;
2199 switch(format)
2200 {
2201 case TensorSamplerFormat::C_WH_1:
2202 case TensorSamplerFormat::C_W_H:
2203 return _tensor->component(TensorComponent::Stride1);
2204 default:
2205 std::cout << "Unsupported tensor format" << std::endl;
2206 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002207 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002208 }
2209 }
2210
2211 std::string tensor_component_stride_z() const
2212 {
2213 const auto format = _sampler.format;
2214 switch(format)
2215 {
2216 case TensorSamplerFormat::C_WH_1:
2217 return "0";
2218 case TensorSamplerFormat::C_W_H:
2219 return _tensor->component(TensorComponent::Stride2);
2220 default:
2221 std::cout << "Unsupported tensor format" << std::endl;
2222 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002223 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002224 }
2225 }
2226
2227 std::string tensor_component_stride_batch() const
2228 {
2229 const auto format = _sampler.format;
2230 switch(format)
2231 {
2232 case TensorSamplerFormat::C_WH_1:
2233 case TensorSamplerFormat::C_W_H:
2234 return _tensor->component(TensorComponent::Stride3);
2235 default:
2236 std::cout << "Unsupported tensor format" << std::endl;
2237 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002238 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002239 }
2240 }
2241
2242 bool is_one_component_x() const
2243 {
2244 auto t = _tensor->format();
2245 const auto format = _sampler.format;
2246 switch(format)
2247 {
2248 case TensorSamplerFormat::C_WH_1:
2249 case TensorSamplerFormat::C_W_H:
2250 return t.shape[0] == 1;
2251 default:
2252 std::cout << "Unsupported tensor format" << std::endl;
2253 assert(false);
                return false;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002255 }
2256 }
2257
2258 bool is_one_component_y() const
2259 {
2260 auto t = _tensor->format();
2261 const auto format = _sampler.format;
2262 switch(format)
2263 {
2264 case TensorSamplerFormat::C_WH_1:
2265 return (t.shape[1] * t.shape[2]) == 1;
2266 case TensorSamplerFormat::C_W_H:
2267 return t.shape[1] == 1;
2268 default:
2269 std::cout << "Unsupported tensor format" << std::endl;
2270 assert(false);
                return false;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002272 }
2273 }
2274
2275 bool is_one_component_z() const
2276 {
2277 auto t = _tensor->format();
2278 const auto format = _sampler.format;
2279 switch(format)
2280 {
2281 case TensorSamplerFormat::C_WH_1:
2282 return true;
2283 case TensorSamplerFormat::C_W_H:
2284 return t.shape[2] == 1;
2285 default:
2286 std::cout << "Unsupported tensor format" << std::endl;
2287 assert(false);
                return false;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002289 }
2290 }
2291
2292 bool is_one_component_batch() const
2293 {
2294 auto t = _tensor->format();
2295 const auto format = _sampler.format;
2296 switch(format)
2297 {
2298 case TensorSamplerFormat::C_WH_1:
2299 case TensorSamplerFormat::C_W_H:
2300 return t.shape[3] == 1;
2301 default:
2302 std::cout << "Unsupported tensor format" << std::endl;
2303 assert(false);
                return false;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002305 }
2306 }
2307
2308 GpuSampler gpu_sampler() const
2309 {
2310 return _sampler;
2311 }
2312
2313 IGpuTensorArgument* tensor_argument() const
2314 {
2315 return _tensor;
2316 }
2317
2318private:
2319 GpuSampler _sampler;
2320 IGpuTensorArgument* _tensor;
2321};
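// Illustrative example of the mapping (hypothetical NHWC shape {C = 32, W = 16, H = 8, N = 1}):
//   - TensorSamplerFormat::C_W_H  maps (x, y, z) onto (C, W, H): x spans 32, y spans 16 and z spans 8 elements.
//   - TensorSamplerFormat::C_WH_1 collapses W and H: x spans 32, y spans 16 * 8 = 128 elements and z is always 1
//     (tensor_component_z() returns "1" and tensor_component_stride_z() returns "0").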
2322
2323struct GpuKernelWriterAttribute
2324{
2325 bool return_tensor_component_by_value { false };
2326};
2327
2328enum class ConvertPolicy
2329{
2330 Wrap, /**< Wrap around */
2331 Saturate /**< Saturate */
2332};
2333
2334enum class RoundingMode
2335{
2336 None,
2337 Rte,
2338 Rtz,
2339 Rtp,
2340 Rtn
2341};
2342
2343// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html
2344class IGpuKernelWriter
2345{
2346public:
2347 virtual ~IGpuKernelWriter() = default;
2348 virtual void set_IdSpace(int32_t id) = 0;
2349 virtual void import_tile(const std::string& dst, const IVectorTile *src) = 0;
2350 virtual void declare_argument(const std::string& name, const TensorInfo& tensor) = 0;
2351 virtual void declare_tile(const std::string& name, const TileInfo& info) = 0;
2352 virtual void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) = 0;
2353 virtual void write_text(const std::string& x) = 0;
2354 virtual void compound_statement_begin() = 0;
2355 virtual void compound_statement_end() = 0;
2356
2357 // Operations
2358 virtual void op_get_global_id(const Operand& dst_var, int32_t dim) = 0;
2359 virtual void op_get_global_coord(const Operand& dst, const Operand& step, const TensorOperand& tensor, int32_t dim) = 0;
2360 virtual void op_get_global_batch(const Operand& dst, const TensorOperand& tensor) = 0;
2361 virtual void op_get_global_size(const Operand& dst_var, int32_t dim) = 0;
2362 virtual void op_binary_expression(const Operand& dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
2363 virtual void op_assign(const Operand& dst_name, const Operand& src_name) = 0;
2364 virtual void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) = 0;
2365 virtual void op_if(const Operand& lhs, BinaryOp op, const Operand& rhs) = 0;
2366 virtual void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value, AssignmentOp update_op, const Operand& update_value) = 0;
2367 virtual void op_load_indirect(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y_indirect, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
2368 virtual void op_load_immediate(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32), const Operand& dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
2369 virtual void op_store_immediate(const TensorOperand& tensor, const Operand& src, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
2370 virtual void op_cast_expression(const Operand& dst, const Operand &src, ConvertPolicy policy) = 0;
2371 virtual void op_return() = 0;
2372 // virtual void op_else() = 0;
2373 // virtual void op_elseif() = 0;
2374 // Utils
    // Computes a tile of indirect offsets (x + y * width) into a tensor, writing -1 for out-of-bounds coordinates
2376 virtual void util_get_indirect_buffer(const Operand& dst, const TensorOperand& tensor, const Operand& x, const Operand& y, const Operand& x_off, const Operand& y_off) = 0;
2377};
2378
2379enum class GpuLoadStoreType
2380{
2381 Load = 1,
2382 Store = 2
2383};
2384
2385class IGpuLoadStoreHelperWriter
2386{
2387public:
2388 IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type) : _writer(x), _mapper(mapper), _type(type) {}
2389 IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;
2390 IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;
2391 virtual ~IGpuLoadStoreHelperWriter() = default;
2392 virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;
2393 virtual void write(const std::pair<int32_t, std::string>& y) = 0;
2394 virtual void finalize() = 0;
2395protected:
2396 IGpuKernelWriter* _writer;
2397 GpuTensor3dMapper _mapper;
2398 GpuLoadStoreType _type;
2399};
2400
2401class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter
2402{
2403public:
2404 ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
2405 {
2406 }
2407
2408 ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default;
2409 ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;
2410
2411 static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
2412 {
2413 CKW_UNUSED(x, type, dst);
2414
2415 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
2416 {
2417 return false;
2418 }
2419 return true;
2420 }
2421
2422 void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
2423 {
2424 assert(validate(_writer, _mapper, _type, dst));
2425
2426 _dst = dst;
2427 _ls_width_full = dst->format().w;
2428
2429 _coord_x = x->scalar(0, 0).str;
2430 _coord_z = z->scalar(0, 0).str;
2431 _coord_b = b->scalar(0, 0).str;
2432 _coord_orig_z = _coord_z;
2433
2434 out_of_bound_initialize_x(_coord_x);
2435 out_of_bound_initialize_z(_coord_z);
2436
2437 /*
2438 meaning of else:
2439 - x: partial load/store
2440 - y: no load/store operation
2441 - z: no load/store operation
2442 if(x)
2443 {
2444 if(z)
2445 {
2446 if(y)
2447 {
2448 // full load/store width
2449 }
2450 else
2451 {
2452 // no load/store
2453 }
2454 }
2455 else
2456 {
2457 // no load/store
2458 }
2459 }
2460 else
2461 {
2462 if(z)
2463 {
2464 if(y)
2465 {
2466 // partial load/store width
2467 }
2468 else
2469 {
2470 // no load/store
2471 }
2472 }
2473 else
2474 {
2475 // no load/store
2476 }
2477 }
2478 */
2479 }
2480
2481 void write(const std::pair<int32_t, std::string>& y) override
2482 {
2483 int32_t idx_y = y.first;
2484 std::string coord_y = y.second;
2485
2486 // The only check required is on Y.
2487 out_of_bound_initialize_y(coord_y);
2488
2489 const std::string dst = _dst->vector(idx_y).str;
2490 const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
2491 const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address);
2492
2493 _writer->write_text(ls_buf);
2494 _writer->write_text(";\n");
2495
2496 out_of_bound_finalize_y(dst);
2497
        // The leftover load/store will be written in the finalize stage
2499 if(_ls_width_part.size() != 0)
2500 {
2501 int32_t w = 0;
2502 for(auto &p : _ls_width_part)
2503 {
2504 const std::string dst0 = _dst->vector(w, p, idx_y).str;
2505 const std::string coord_x = _coord_x + " + " + std::to_string(w);
2506 const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
2507 const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address);
2508 _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0));
2509
2510 w += p;
2511 }
2512 }
2513 }
2514
2515 void finalize() override
2516 {
2517 out_of_bound_finalize_z();
2518 out_of_bound_finalize_x();
2519 }
2520private:
2521 IVectorTile* _dst { nullptr };
2522 int32_t _ls_width_full { 0 };
2523 std::vector<int32_t> _ls_width_part { };
2524 std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x {};
2525 std::string _coord_x {};
2526 std::string _coord_z {};
2527 std::string _coord_orig_z {};
2528 std::string _coord_b {};
2529
2530 void out_of_bound_initialize_x(std::string& coord)
2531 {
2532 if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
2533 {
2534 auto tensor_format = _mapper.tensor_argument()->format();
2535 auto shape = tensor_format.shape;
2536
2537 _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
2538 if(_ls_width_part.size() != 0)
2539 {
2540 _writer->write_text("if(" + coord + " > 0)\n");
2541 _writer->compound_statement_begin();
2542 }
2543 }
2544 };
2545
2546 void out_of_bound_finalize_x()
2547 {
2548 if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
2549 {
2550 if(_ls_width_part.size() != 0)
2551 {
2552 _writer->compound_statement_end();
2553 _writer->write_text("else\n");
2554 _writer->compound_statement_begin();
2555
2556 out_of_bound_initialize_z(_coord_orig_z);
2557 for(auto &i : _leftovers_x)
2558 {
2559 out_of_bound_initialize_y(i.first.second);
2560 _writer->write_text(i.second);
2561 _writer->write_text(";\n");
2562 out_of_bound_finalize_y(i.first.first);
2563 }
2564 out_of_bound_finalize_z();
2565 _writer->compound_statement_end();
2566 }
2567 }
2568 };
2569
2570 void out_of_bound_initialize_y(std::string& coord)
2571 {
2572 std::string max = "";
2573
2574 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2575
2576 switch(address_mode_y)
2577 {
2578 case TensorSamplerAddressModeY::Skip:
2579 case TensorSamplerAddressModeY::ClampToBorder:
2580 // NOTE: This line should not be moved outside of the switch statement.
                // Querying the component marks it as used and adds it to the list of kernel arguments.
                // Since not every address mode requires this component, it must be requested only in the
                // cases that actually use it.
2584 max = _mapper.tensor_component_y();
2585 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2586 _writer->compound_statement_begin();
2587 break;
2588 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2589 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2590 _writer->write_text("if(" + coord + " >= 0)\n");
2591 _writer->compound_statement_begin();
2592 break;
2593 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2594 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2595 max = _mapper.tensor_component_y();
2596 _writer->write_text("if(" + coord + " < " + max + ")\n");
2597 _writer->compound_statement_begin();
2598 break;
2599 case TensorSamplerAddressModeY::ClampToNearest:
2600 max = _mapper.tensor_component_y();
2601 coord = "clamp(" + coord + ", 0, " + max + " - 1)";
2602 break;
2603 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
2604 max = _mapper.tensor_component_y();
2605 coord = "min(" + coord + ", " + max + " - 1)";
2606 break;
2607 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
2608 coord = "max(" + coord + ", 0)";
2609 break;
2610 case TensorSamplerAddressModeY::None:
2611 break;
2612 default:
                std::cout << "Unsupported address mode for out_of_bound_initialize_y" << std::endl;
2614 assert(false);
2615 }
2616 };
2617
2618 void out_of_bound_finalize_y(const std::string& dst)
2619 {
2620 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2621
2622 switch(address_mode_y)
2623 {
2624 case TensorSamplerAddressModeY::ClampToBorder:
2625 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2626 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2627 case TensorSamplerAddressModeY::Skip:
2628 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2629 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2630 _writer->compound_statement_end();
2631 break;
2632
2633 default:
2634 assert(false);
2635 }
2636
2637 switch(address_mode_y)
2638 {
2639 case TensorSamplerAddressModeY::ClampToBorder:
2640 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2641 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2642 _writer->write_text("else\n");
2643 _writer->compound_statement_begin();
2644 _writer->write_text(dst);
2645 _writer->write_text(" = 0.0f;\n");
2646 _writer->compound_statement_end();
2647 break;
2648
2649 default:
2650 assert(false);
2651 }
2652 };
2653
2654 void out_of_bound_initialize_z(std::string& coord)
2655 {
2656 std::string max = "";
2657
2658 const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
2659
2660 switch(address_mode_z)
2661 {
2662 case TensorSamplerAddressModeZ::Skip:
2663 max = _mapper.tensor_component_z();
2664 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2665 _writer->compound_statement_begin();
2666 break;
2667 case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
2668 _writer->write_text("if(" + coord + " >= 0)\n");
2669 _writer->compound_statement_begin();
2670 break;
2671 case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
2672 max = _mapper.tensor_component_z();
2673 _writer->write_text("if(" + coord + " < " + max + ")\n");
2674 _writer->compound_statement_begin();
2675 break;
2676 case TensorSamplerAddressModeZ::ClampToNearest:
2677 max = _mapper.tensor_component_z();
2678 coord = "clamp(" + coord + ", 0, " + max + " - 1)";
2679 break;
2680 case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
2681 max = _mapper.tensor_component_z();
2682 coord = "min(" + coord + ", " + max + " - 1)";
2683 break;
2684 case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
2685 coord = "max(" + coord + ", 0)";
2686 break;
2687 case TensorSamplerAddressModeZ::None:
2688 break;
2689 default:
                std::cout << "Unsupported address mode for out_of_bound_initialize_z" << std::endl;
2691 assert(false);
2692 }
2693 };
2694
2695 void out_of_bound_finalize_z()
2696 {
2697 const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
2698
        switch(address_mode_z)
        {
            case TensorSamplerAddressModeZ::Skip:
            case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
            case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
                // Close the compound statement opened in out_of_bound_initialize_z()
                _writer->compound_statement_end();
                break;
            case TensorSamplerAddressModeZ::ClampToNearest:
            case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
            case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
            case TensorSamplerAddressModeZ::None:
                // These modes do not open a compound statement, so there is nothing to close
                break;

            default:
                assert(false);
        }
2710 };
2711
2712 std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const
2713 {
2714 std::vector<int32_t> x;
2715
2716 switch(ls_leftover_vector_width)
2717 {
2718 case 0:
2719 break;
2720 case 1:
2721 case 2:
2722 case 3:
2723 case 4:
2724 case 8:
2725 case 16:
2726 x.push_back(ls_leftover_vector_width);
2727 break;
2728 case 5:
2729 x.push_back(4);
2730 x.push_back(1);
2731 break;
2732 case 6:
2733 x.push_back(4);
2734 x.push_back(2);
2735 break;
2736 case 7:
2737 x.push_back(4);
2738 x.push_back(3);
2739 break;
2740 case 9:
2741 x.push_back(8);
2742 x.push_back(1);
2743 break;
2744 case 10:
2745 x.push_back(8);
2746 x.push_back(2);
2747 break;
2748 case 11:
2749 x.push_back(8);
2750 x.push_back(3);
2751 break;
2752 case 12:
2753 x.push_back(8);
2754 x.push_back(4);
2755 break;
2756 case 13:
2757 x.push_back(8);
2758 x.push_back(4);
2759 x.push_back(1);
2760 break;
2761 case 14:
2762 x.push_back(8);
2763 x.push_back(4);
2764 x.push_back(2);
2765 break;
2766 case 15:
2767 x.push_back(8);
2768 x.push_back(4);
2769 x.push_back(3);
2770 break;
2771
2772 default:
2773 assert(false);
2774 }
2775 return x;
2776 }
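    // Worked examples of the decomposition above:
    //   leftover width  7 -> {4, 3}   (emitted as a vload4/vstore4 followed by a vload3/vstore3)
    //   leftover width 13 -> {8, 4, 1}
    //   leftover width  0 -> {}       (the tile width divides the tensor width, no partial access needed)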
2777
2778 std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& address)
2779 {
2780 switch(type)
2781 {
2782 case GpuLoadStoreType::Load:
2783 if(vector_width != 1)
2784 {
2785 return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
2786 }
2787 else
2788 {
2789 return data + " = *(" + address + ")";
2790 }
2791 break;
2792 case GpuLoadStoreType::Store:
2793 if(vector_width != 1)
2794 {
2795 return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
2796 }
2797 else
2798 {
2799 return "*(" + address + ") = " + data;
2800 }
2801 break;
2802 default:
2803 std::cout << "Unsupported GpuLoadStoreType" << std::endl;
2804 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01002805 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01002806 }
2807 }
2808
2809 std::string to_ls_buffer_address(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
2810 {
2811 auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
2812 assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
2813 const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage);
2814 const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
2815
2816 std::string address;
2817 address += "(__global ";
2818 address += dst_type;
2819 address += "*)(";
2820 address += ptr_buf;
2821 if(x != "0" && (_mapper.is_one_component_x() != true))
2822 {
2823 address += " + (";
2824 address += x + ") * sizeof(" + dst_type + ")";
2825 }
2826 if(y != "0" && (_mapper.is_one_component_y() != true))
2827 {
2828 const std::string stride_y = _mapper.tensor_component_stride_y();
2829 address += " + (";
2830 address += y + ")";
2831 address += " * ";
2832 address += stride_y;
2833 }
2834 if(z != "0" && (_mapper.is_one_component_z() != true))
2835 {
2836 const std::string stride_z = _mapper.tensor_component_stride_z();
2837 address += " + (";
2838 address += z + ")";
2839 address += " * ";
2840 address += stride_z;
2841 }
2842 if(b != "0" && (_mapper.is_one_component_batch() != true))
2843 {
2844 const std::string stride_b = _mapper.tensor_component_stride_batch();
2845 address += " + (";
2846 address += b + ")";
2847 address += " * ";
2848 address += stride_b;
2849 }
2850 address += ")";
2851 return address;
2852 }
2853};
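// Example of the OpenCL C fragments emitted by ClLoadStoreBufferHelperWriter (illustrative only; the
// pointer, stride and tile names depend on the registered tensor argument and destination tile):
//
//   dst_0 = vload4(0, (__global float*)(src_ptr + (x) * sizeof(float) + (y_0) * src_stride_y));
//   vstore4(out_0, 0, (__global float*)(dst_ptr + (x) * sizeof(float) + (y_0) * dst_stride_y));
//
// Coordinates equal to "0", and dimensions known to be 1, are folded away by to_ls_buffer_address().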
2854
2855class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
2856{
2857public:
2858 static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type, IVectorTile *dst)
2859 {
2860 CKW_UNUSED(x);
2861
2862 if(dst->format().w != 4)
2863 {
2864 return false;
2865 }
2866 if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
2867 {
2868 return false;
2869 }
2870 if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
2871 {
2872 return false;
2873 }
2874 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
2875 {
2876 return false;
2877 }
2878 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store)
2879 {
2880 return false;
2881 }
2882 if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
2883 {
2884 return false;
2885 }
2886 return true;
2887 /*
2888 - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4
2889 - z: Only GpuSamplerAddressModeZ::None is supported
2890 */
2891 }
2892 ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
2893 {
2894 }
2895
2896 ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default;
2897 ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default;
2898
2899 void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
2900 {
2901 assert(validate(_writer, _mapper, _type, dst));
2902
2903 _dst = dst;
2904 _ls_width_full = dst->format().w;
2905 _coord_x = x->scalar(0, 0).str;
2906 _coord_z = z->scalar(0, 0).str;
2907 _coord_b = b->scalar(0, 0).str;
2908
2909 /*
2910 if(y)
2911 {
2912 // full load/store width
2913 }
2914 else
2915 {
2916 // no load/store
2917 }
2918 */
2919 }
2920
2921 void write(const std::pair<int32_t, std::string>& y) override
2922 {
2923 int32_t idx_y = y.first;
2924 std::string coord_y = y.second;
2925
2926 // The only check required is on Y.
2927 out_of_bound_initialize_y(coord_y);
2928
2929 const std::string dst = _dst->vector(idx_y).str;
2930 const std::string sampler = to_ls_image2d_sampler();
2931 const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b);
2932 const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord);
2933
2934 _writer->write_text(ls_buf);
2935 _writer->write_text(";\n");
2936
2937 out_of_bound_finalize_y(dst);
2938 }
2939
2940 void finalize() override
2941 {
2942 }
2943private:
2944 IVectorTile* _dst { nullptr };
2945 int32_t _ls_width_full { 0 };
2946 std::string _coord_x {};
2947 std::string _coord_z {};
2948 std::string _coord_b {};
2949
2950 void out_of_bound_initialize_y(std::string& coord)
2951 {
2952 std::string max = "";
2953
2954 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2955
2956 switch(address_mode_y)
2957 {
2958 case TensorSamplerAddressModeY::Skip:
2959 max = _mapper.tensor_component_y();
2960 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2961 _writer->compound_statement_begin();
2962 break;
2963 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2964 _writer->write_text("if(" + coord + " >= 0)\n");
2965 _writer->compound_statement_begin();
2966 break;
2967 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2968 max = _mapper.tensor_component_y();
2969 _writer->write_text("if(" + coord + " < " + max + ")\n");
2970 _writer->compound_statement_begin();
2971 break;
2972 case TensorSamplerAddressModeY::ClampToBorder:
2973 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2974 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2975 case TensorSamplerAddressModeY::ClampToNearest:
2976 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
2977 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
2978 case TensorSamplerAddressModeY::None:
2979 break;
2980 default:
                std::cout << "Unsupported address mode for out_of_bound_initialize_y" << std::endl;
2982 assert(false);
2983 }
2984 };
2985
2986 void out_of_bound_finalize_y(const std::string& dst)
2987 {
2988 CKW_UNUSED(dst);
2989
2990 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2991
        switch(address_mode_y)
        {
            case TensorSamplerAddressModeY::Skip:
            case TensorSamplerAddressModeY::SkipMinEdgeOnly:
            case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
                // Close the compound statement opened in out_of_bound_initialize_y()
                _writer->compound_statement_end();
                break;
            case TensorSamplerAddressModeY::ClampToBorder:
            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
            case TensorSamplerAddressModeY::ClampToNearest:
            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
            case TensorSamplerAddressModeY::None:
                // Border handling for these modes is delegated to the image sampler, so there is nothing to close
                break;

            default:
                assert(false);
        }
3003 };
3004
3005 std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& sampler, const std::string& coord)
3006 {
3007 CKW_UNUSED(vector_width);
3008
3009 auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
3010 const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage);
3011 // const DataType dt = _dst->format().dt;
3012 const std::string post_fix = _dst->format().dt == DataType::Fp32? "f" : "h";
3013
3014 switch(type)
3015 {
3016 case GpuLoadStoreType::Load:
3017 return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
3018 break;
3019 case GpuLoadStoreType::Store:
3020 return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
3021 default:
                std::cout << "Unsupported GpuLoadStoreType" << std::endl;
                assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003025 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003026 }
3027 }
3028
3029 std::string to_ls_image2d_sampler() const
3030 {
3031 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
3032
3033 switch(address_mode_y)
3034 {
3035 case TensorSamplerAddressModeY::None:
3036 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
3037 case TensorSamplerAddressModeY::Skip:
3038 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
3039 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
3040 case TensorSamplerAddressModeY::ClampToBorder:
3041 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
3042 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
3043 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
3044 case TensorSamplerAddressModeY::ClampToNearest:
3045 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
3046 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
3047 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
3048 default:
3049 std::cout << "Unsupported address_mode_coord" << std::endl;
3050 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003051 return "";
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003052 }
3053 }
3054
3055 std::string to_ls_image2d_coord(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
3056 {
3057 std::string coord_x = "(" + x + ") >> 2";
3058 std::string coord_y = "(";
3059
3060 if(y != "0" && (_mapper.is_one_component_y() != true))
3061 {
3062 coord_y += y;
3063 }
3064 if(z != "0" && (_mapper.is_one_component_z() != true))
3065 {
3066 const std::string dim = _mapper.tensor_component_y();
3067 coord_y += " + (";
3068 coord_y += z + ")";
3069 coord_y += " * ";
3070 coord_y += dim;
3071 }
3072 if(b != "0" && (_mapper.is_one_component_batch() != true))
3073 {
3074 const std::string dim0 = _mapper.tensor_component_y();
3075 const std::string dim1 = _mapper.tensor_component_z();
3076 coord_y += " + (";
3077 coord_y += b + ")";
3078 coord_y += " * ";
3079 coord_y += dim0;
3080 coord_y += " * ";
3081 coord_y += dim1;
3082 }
3083 coord_y += ")";
3084 return "(int2)(" + coord_x + ", " + coord_y + ")";
3085 }
3086};
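// Example of the coordinate packing performed by to_ls_image2d_coord() (illustrative): for a C_W_H sampler,
// the element at (x, y, z, b) is accessed at pixel
//
//   (int2)((x) >> 2, (y + z * dimY + b * dimY * dimZ))
//
// where dimY and dimZ are the mapper's y and z dimensions. The x pixel coordinate addresses four channels at
// a time, which is why validate() requires the tile width to be 4.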
3087
3088/** IGpuLoadStoreHelperWriter factory class */
3089class ClLoadStoreHelperWriterFactory final
3090{
3091public:
    /** Static method to create the IGpuLoadStoreHelperWriter according to the tensor storage set in the mapper
    *
    * @param[in] x      Gpu kernel writer used to emit the code
    * @param[in] mapper Mapper that reduces the tensor to a 3d tensor
    * @param[in] type   Whether a load or a store helper is required
    *
    * @return IGpuLoadStoreHelperWriter
3096 */
3097 static std::unique_ptr<IGpuLoadStoreHelperWriter> create(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type)
3098 {
3099 const auto tensor_storage = mapper.gpu_sampler().storage;
3100 switch(tensor_storage)
3101 {
3102 case GpuSamplerTensorStorage::BufferUint8Ptr:
3103 return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
3104 case GpuSamplerTensorStorage::Image2dReadOnly:
3105 case GpuSamplerTensorStorage::Image2dWriteOnly:
3106 return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
3107 default:
3108 std::cout << "Unsupported Gpu tensor storage" << std::endl;
3109 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003110 return nullptr;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003111 }
3112 }
3113};
3114
3115// This utility method needs to go in utils.h
3116inline bool is_tile_scalar(IVectorTile* x)
3117{
3118 return x->format().w == 1 && x->format().h == 1;
3119}
3120
3121class ClKernelWriter : public IGpuKernelWriter
3122{
3123public:
3124 ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
3125 {
3126 _data = x;
3127 _attr = attr;
3128 }
3129
3130 ClKernelWriter(const ClKernelWriter &) = default;
3131 ClKernelWriter &operator=(const ClKernelWriter &) = default;
3132
    // An IdSpace ID identifies a fragment registered in ICode to ensure
    // there are no naming conflicts or ambiguity in the generated code
3135 void set_IdSpace(int32_t id) override
3136 {
3137 _data->tiles.set_IdSpace(id);
3138 _data->arguments.set_IdSpace(id);
3139 }
3140
3141 void import_tile(const std::string& dst_name, const IVectorTile *src) override
3142 {
3143 _data->tiles.insert(dst_name, src);
3144 }
3145
3146 void declare_argument(const std::string& name, const TensorInfo& tensor) override
3147 {
3148 assert(_data->arguments[name] == nullptr);
3149 _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
3150 }
3151
3152 void declare_tile(const std::string& name, const TileInfo& format) override
3153 {
3154 assert(_data->tiles[name] == nullptr);
3155 _data->tiles.insert(name, format);
3156
3157 IVectorTile *x = _data->tiles[name];
3158
3159 for(auto &t : x->underlying_source_variables())
3160 {
3161 _data->code += t.type.str + " " + t.str + ";\n";
3162 }
3163 }
3164
3165 void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) override
3166 {
3167 assert(_data->tiles[name] == nullptr);
3168 _data->tiles.insert(name, in, dt);
3169 // Note: A constant does not need to be declared in the code
3170 }
3171
3172 void write_text(const std::string& x) override
3173 {
3174 _data->code += x;
3175 }
3176
3177 void compound_statement_begin() override
3178 {
3179 _data->tiles.increment_registry_level();
3180 _data->code += "{\n";
3181 }
3182
3183 void compound_statement_end() override
3184 {
3185 _data->tiles.decrement_registry_level();
3186 _data->code += "}\n";
3187 }
3188
3189 void op_get_global_id(const Operand& dst_var, int32_t dim) override
3190 {
3191 assert(dst_var.type() == OperandType::Tile);
3192 assert(_data->tiles.has_tile(dst_var.value()));
3193 assert(_data->tiles[dst_var.value()]->format().w == 1 &&
3194 _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
3195
3196 auto var = _data->tiles[dst_var.value()];
3197
3198 _data->code += var->scalar(0, 0).str;
3199 _data->code += " = get_global_id(";
3200 _data->code += std::to_string(dim);
3201 _data->code += ");\n";
3202 };
3203
3204 void op_get_global_coord(const Operand& o_dst, const Operand& o_step, const TensorOperand& o_tensor, int32_t dim) override
3205 {
3206 OperandUnpacker operands(_data->tiles, _data->arguments);
3207 auto dst = operands.unpack(o_dst);
3208 auto step = operands.unpack(o_step);
3209
3210 // Validation: Check that x, y and z are scalar
3211
3212 TensorOperandUnpacker tensor_operands(_data->arguments);
3213 auto tensor = tensor_operands.unpack(o_tensor);
3214 auto gpu_sampler = o_tensor.sampler();
3215
3216 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3217
3218 switch (dim)
3219 {
3220 case 0:
3221 if(mapper.is_one_component_x())
3222 {
3223 _data->code += dst->scalar(0, 0).str;
3224 _data->code += " = 0;\n";
3225 }
3226 else
3227 {
3228 if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
3229 {
3230 // Validation: Check: fixed tensor shape
3231 // TO BE CHANGED
3232 _data->code += dst->scalar(0, 0).str;
3233 _data->code += " = get_global_id(0) * ";
3234 _data->code += step->scalar(0, 0).str;
3235 _data->code += ";\n";
3236 }
3237 else
3238 {
3239 _data->code += dst->scalar(0, 0).str;
3240 _data->code += " = get_global_id(0) * ";
3241 _data->code += step->scalar(0, 0).str;
3242 _data->code += ";\n";
3243 }
3244 }
3245 break;
3246 case 1:
3247 if(mapper.is_one_component_y())
3248 {
3249 _data->code += dst->scalar(0, 0).str;
3250 _data->code += " = 0;\n";
3251 }
3252 else
3253 {
3254 if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
3255 {
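                    // Note: no code is emitted for OverlappingMin on the Y dimension in this prototype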
3256
3257 }
3258 else
3259 {
3260 _data->code += dst->scalar(0, 0).str;
3261 _data->code += " = get_global_id(1) * ";
3262 _data->code += step->scalar(0, 0).str;
3263 _data->code += ";\n";
3264 }
3265 }
3266 break;
3267 case 2:
3268 if(mapper.is_one_component_z())
3269 {
3270 _data->code += dst->scalar(0, 0).str;
3271 _data->code += " = 0;\n";
3272 }
3273 else
3274 {
3275 _data->code += dst->scalar(0, 0).str;
3276 _data->code += " = get_global_id(2) * ";
3277 _data->code += step->scalar(0, 0).str;
3278 _data->code += ";\n";
3279 }
3280 break;
3281 default:
3282 break;
3283 }
3284 };
3285
3286 void op_get_global_batch(const Operand& o_dst, const TensorOperand& o_tensor) override
3287 {
3288 OperandUnpacker operands(_data->tiles, _data->arguments);
3289 auto dst = operands.unpack(o_dst);
3290
3291 TensorOperandUnpacker tensor_operands(_data->arguments);
3292 auto tensor = tensor_operands.unpack(o_tensor);
3293 auto gpu_sampler = o_tensor.sampler();
3294
3295 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3296
3297 if(mapper.is_one_component_batch())
3298 {
3299 _data->code += dst->scalar(0, 0).str;
3300 _data->code += " = 0;\n";
3301 }
3302 else
3303 {
3304 std::cout << "Unsupported batched computation" << std::endl;
3305 assert(false);
3306 }
3307 };
3308
3309 void op_get_global_size(const Operand& dst_var, int32_t dim) override
3310 {
3311 assert(dst_var.type() == OperandType::Tile);
3312 assert(_data->tiles.has_tile(dst_var.value()));
3313 assert(_data->tiles[dst_var.value()]->format().w == 1 &&
3314 _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
3315
3316 auto var = _data->tiles[dst_var.value()];
3317
3318 _data->code += var->scalar(0, 0).str;
3319 _data->code += " = get_global_size(";
3320 _data->code += std::to_string(dim);
3321 _data->code += ");\n";
3322 }
3323
3324 void op_binary_expression(const Operand& dst_name, const Operand& lhs_name, BinaryOp op, const Operand& rhs_name) override
3325 {
3326 OperandUnpacker operands(_data->tiles, _data->arguments);
3327 auto lhs = operands.unpack(lhs_name);
3328 auto rhs = operands.unpack(rhs_name);
3329 auto dst = operands.unpack(dst_name);
3330
3331 const int32_t dst_w = dst->format().w;
3332 const int32_t dst_h = dst->format().h;
3333 assert(lhs != nullptr);
3334 const int32_t lhs_w = lhs->format().w;
3335 const int32_t rhs_w = rhs->format().w;
3336
3337 if(op == BinaryOp::MatMul_Nt_T)
3338 {
3339 assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
3340 for(int32_t y = 0; y < dst_h; ++y)
3341 {
3342 for(int32_t x = 0; x < dst_w; ++x)
3343 {
3344 for(int32_t k = 0; k < lhs_w; ++k)
3345 {
3346 _data->code += dst->scalar(x, y).str;
3347 _data->code += " = fma(";
3348 _data->code += lhs->scalar(k, y).str;
3349 _data->code += ", ";
3350 _data->code += rhs->scalar(k, x).str;
3351 _data->code += ", ";
3352 _data->code += dst->scalar(x, y).str;
3353 _data->code += ");\n";
3354 }
3355 }
3356 }
3357
3358 return;
3359 }
3360
3361 bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
3362 bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;
3363
3364 std::string lhs_prefix = broadcast_lhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
3365 std::string rhs_prefix = broadcast_rhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
3366 std::string op_str = to_string(op);
3367
3368 // Broadcasting on Y is automatic
3369 for(int32_t y = 0; y < dst_h; ++y)
3370 {
3371 _data->code += dst->vector(y).str;
3372 _data->code += " = ";
3373 _data->code += lhs_prefix + lhs->vector(y).str;
3374 _data->code += " ";
3375 _data->code += op_str;
3376 _data->code += " ";
3377 _data->code += rhs_prefix + rhs->vector(y).str;
3378 _data->code += ";\n";
3379 }
3380 };
3381
3382 void op_cast_expression(const Operand& o_dst, const Operand &o_src, ConvertPolicy policy) override
3383 {
3384 CKW_UNUSED(policy);
3385
3386 OperandUnpacker operands(_data->tiles, _data->arguments);
3387 auto src = operands.unpack(o_src);
3388 auto dst = operands.unpack(o_dst);
3389
3390 // const int32_t dst_w = dst->format().w;
3391 const int32_t dst_h = dst->format().h;
3392 const std::string dt = dst->scalar(0, 0).type.str;
3393
3394 // Broadcasting on Y is automatic
3395 for(int32_t y = 0; y < dst_h; ++y)
3396 {
3397 _data->code += dst->vector(y).str;
3398 _data->code += " = convert_" + dt + "(";
3399 _data->code += src->vector(y).str;
3400 _data->code += ");\n";
3401 }
3402 };
3403
3404 void op_assign(const Operand& dst_name, const Operand& src_name) override
3405 {
3406 OperandUnpacker operands(_data->tiles, _data->arguments);
3407 auto src = operands.unpack(src_name);
3408 auto dst = operands.unpack(dst_name);
3409
3410 const int32_t dst_w = dst->format().w;
3411 const int32_t dst_h = dst->format().h;
3412 const int32_t src_w = src->format().w;
3413 // const int32_t src_h = src->format().h;
3414 const std::string dt = dst->scalar(0, 0).type.str;
3415
3416 bool broadcast_src_x = dst_w != 1 && src_w == 1;
3417
3418 std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";
3419
3420 // Broadcasting on Y is automatic
3421 for(int32_t y = 0; y < dst_h; ++y)
3422 {
3423 _data->code += dst->vector(y).str;
3424 _data->code += " = ";
3425 _data->code += src_prefix + src->vector(y).str;
3426 _data->code += ";\n";
3427 }
3428 }
3429
3430 void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) override
3431 {
3432 OperandUnpacker operands(_data->tiles, _data->arguments);
3433 auto src = operands.unpack(src_name);
3434 auto dst = operands.unpack(dst_name);
3435
3436 const int32_t dst_w = dst->format().w;
3437 const int32_t dst_h = dst->format().h;
3438 const int32_t src_w = src->format().w;
3439 // const int32_t src_h = src->format().h;
3440 const std::string dt = dst->scalar(0, 0).type.str;
3441
3442 bool broadcast_src_x = dst_w != 1 && src_w == 1;
3443
3444 std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";
3445
3446 // Broadcasting on Y is automatic
3447 for(int32_t y = 0; y < dst_h; ++y)
3448 {
3449 _data->code += dst->vector(y).str;
3450 _data->code += " = ";
3451
3452 switch(func)
3453 {
3454 case ScalarUnaryFunction::Exp:
3455 _data->code += "exp(";
3456 break;
3457
3458 default:
3459 CKW_ASSERT(false);
3460 }
3461
3462 _data->code += src_prefix + src->vector(y).str;
3463 _data->code += ");\n";
3464 }
3465 }
3466
3467 void op_if(const Operand& o_lhs, BinaryOp op, const Operand& o_rhs) override
3468 {
3469 OperandUnpacker operands(_data->tiles, _data->arguments);
3470 auto lhs = operands.unpack(o_lhs);
3471 auto rhs = operands.unpack(o_rhs);
3472
3473 assert(is_tile_scalar(lhs));
3474 assert(is_tile_scalar(rhs));
3475
3476 _data->code += "if(";
3477 _data->code += lhs->scalar(0, 0).str;
3478 _data->code += " ";
3479 _data->code += to_string(op);
3480 _data->code += " ";
3481 _data->code += rhs->scalar(0, 0).str;
3482 _data->code += ")\n";
3483 }
3484
3485 void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, AssignmentOp update_op, const Operand& update_value_name) override
3486 {
3487 OperandUnpacker operands(_data->tiles, _data->arguments);
3488 auto var = operands.unpack(var_name);
3489 auto cond_value = operands.unpack(cond_value_name);
3490 auto update_value = operands.unpack(update_value_name);
3491
3492 const int32_t dst_w = var->format().w;
3493 const int32_t dst_h = var->format().h;
3494
3495 // It must be a scalar variable
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003496 CKW_UNUSED(dst_w, dst_h);
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003497 assert(dst_w == 1);
3498 assert(dst_h == 1);
3499
3500 _data->code += "for(; " ;
3501 _data->code += var->scalar(0, 0).str;
3502 _data->code += " ";
3503 _data->code += to_string(cond_op);
3504 _data->code += " " + cond_value->scalar(0, 0).str + "; ";
3505 _data->code += var->scalar(0, 0).str;
3506 _data->code += " ";
3507 _data->code += to_string(update_op);
3508 _data->code += " " + update_value->scalar(0, 0).str + ")";
3509 _data->code += "\n";
3510 }
3511
3512 void op_load_immediate(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_y, const Operand& o_z, const Operand& o_batch_idx, const Operand& dilation_y) override
3513 {
3514 OperandUnpacker operands(_data->tiles, _data->arguments);
3515 auto dst = operands.unpack(o_dst);
3516 auto x = operands.unpack(o_x);
3517 auto y = operands.unpack(o_y);
3518 auto z = operands.unpack(o_z);
3519 auto dil_y = operands.unpack(dilation_y);
3520 auto b = operands.unpack(o_batch_idx);
3521
3522 TensorOperandUnpacker tensor_operands(_data->arguments);
3523 auto tensor = tensor_operands.unpack(o_tensor);
3524 auto gpu_sampler = o_tensor.sampler();
3525
3526 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3527
3528 auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
3529
3530 // Initialize the constant part
3531 load_writer->initialize(dst, x, z, b);
3532
3533 for(int i = 0; i < dst->format().h; ++i)
3534 {
3535 std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
3536 if(dil_y->scalar(0, 0).str != "1")
3537 {
3538 coord_y += " * " + dil_y->scalar(0, 0).str;
3539 }
3540 load_writer->write(std::make_pair(i, coord_y));
3541 }
3542
3543 load_writer->finalize();
3544 }
3545
3546 void op_load_indirect(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_indirect_h, const Operand& o_z, const Operand& o_batch_idx) override
3547 {
3548 OperandUnpacker operands(_data->tiles, _data->arguments);
3549 auto dst = operands.unpack(o_dst);
3550 auto x = operands.unpack(o_x);
3551 auto y_ind = operands.unpack(o_indirect_h);
3552 auto z = operands.unpack(o_z);
3553 auto b = operands.unpack(o_batch_idx);
3554
3555 TensorOperandUnpacker tensor_operands(_data->arguments);
3556 auto tensor = tensor_operands.unpack(o_tensor);
3557 auto gpu_sampler = o_tensor.sampler();
3558
3559 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3560
3561 auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
3562
3563 // Initialize the constant part
3564 load_writer->initialize(dst, x, z, b);
3565
3566 for(int i = 0; i < dst->format().h; ++i)
3567 {
3568 load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
3569 }
3570
3571 load_writer->finalize();
3572 }
3573
3574 void op_store_immediate(const TensorOperand& tensor_name, const Operand& src_name, const Operand& x_name, const Operand& y_name, const Operand& z_name, const Operand& batch_index_name) override
3575 {
3576 OperandUnpacker operands(_data->tiles, _data->arguments);
3577 auto src = operands.unpack(src_name);
3578 auto x = operands.unpack(x_name);
3579 auto y = operands.unpack(y_name);
3580 auto z = operands.unpack(z_name);
3581 auto b = operands.unpack(batch_index_name);
3582
3583 TensorOperandUnpacker tensor_operands(_data->arguments);
3584 auto tensor = tensor_operands.unpack(tensor_name);
3585 auto gpu_sampler = tensor_name.sampler();
3586
3587 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3588
3589 auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);
3590
3591 // Initialize the constant part
3592 store_writer->initialize(src, x, z, b);
3593
3594 int32_t tile_h = src->format().h;
3595
3596 for(int m0 = tile_h - 1; m0 >= 0; m0--)
3597 {
3598 store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
3599 }
3600
3601 store_writer->finalize();
3602 }
3603
3604 void op_return() override
3605 {
3606 _data->code += "return;\n";
3607 }
3608
3609 void util_get_indirect_buffer(const Operand& o_dst, const TensorOperand& o_tensor, const Operand& o_x, const Operand& o_y, const Operand& o_x_off, const Operand& o_y_off) override
3610 {
3611 OperandUnpacker operands(_data->tiles, _data->arguments);
3612 auto dst = operands.unpack(o_dst);
3613 auto x = operands.unpack(o_x);
3614 auto y = operands.unpack(o_y);
3615 auto x_off = operands.unpack(o_x_off);
3616 auto y_off = operands.unpack(o_y_off);
3617
3618 TensorOperandUnpacker tensor_operands(_data->arguments);
3619 auto tensor = tensor_operands.unpack(o_tensor);
3620
3621 assert(dst->format().w == 1);
3622 assert(x->format().w == 1);
3623 assert(y->format().w == 1);
3624 assert(x_off->format().w == 1);
3625 assert(y_off->format().w == 1);
3626 assert(dst->format().dt == DataType::Int32);
3627 assert(x->format().dt == DataType::Int32);
3628 assert(y->format().dt == DataType::Int32);
3629 assert(x_off->format().dt == DataType::Int32);
3630 assert(y_off->format().dt == DataType::Int32);
3631
3632 const std::string width = tensor->component(TensorComponent::W);
3633 const std::string height = tensor->component(TensorComponent::H);
3634 const std::string wxh = tensor->component(TensorComponent::WxH);
3635 /*
3636 int x_s;
3637 int y_s;
3638 x_s = (xi_0 + x_k);
3639 y_s = (yi_0 + y_k);
3640 mi_0 = x_s + y_s * width + b * widthxheight;
3641 mi_0 = select(-1, mi_0, x_s >= 0);
3642 mi_0 = select(-1, mi_0, y_s >= 0);
3643 mi_0 = select(-1, mi_0, x_s < 128);
3644 mi_0 = select(-1, mi_0, y_s < 128);
3645 */
3646 compound_statement_begin();
3647 declare_tile("_x_s", TileInfo(DataType::Int32));
3648 declare_tile("_y_s", TileInfo(DataType::Int32));
3649 auto x_s = operands.unpack(Operand("_x_s"));
3650 auto y_s = operands.unpack(Operand("_y_s"));
3651 for(int i = 0; i < dst->format().h; ++i)
3652 {
3653 // x_s = (xi_0 + x_k);
3654 // y_s = (yi_0 + y_k);
3655 _data->code += x_s->scalar(0, i).str;
3656 _data->code += " = (";
3657 _data->code += x->scalar(0, i).str;
3658 _data->code += " + ";
3659 _data->code += x_off->scalar(0, i).str;
3660 _data->code += ");\n";
3661 _data->code += y_s->scalar(0, i).str;
3662 _data->code += " = (";
3663 _data->code += y->scalar(0, i).str;
3664 _data->code += " + ";
3665 _data->code += y_off->scalar(0, i).str;
3666 _data->code += ");\n";
3667 // mi_0 = x_s + y_s * width;
3668 _data->code += dst->scalar(0, i).str;
3669 _data->code += " = ";
3670 _data->code += x_s->scalar(0, i).str;
3671 _data->code += " + ";
3672 _data->code += y_s->scalar(0, i).str;
3673 _data->code += " * " + width + ";\n";
            // mi_0 = select(-1, mi_0, x_s >= 0);
3675 _data->code += dst->scalar(0, i).str;
3676 _data->code += " = select(-1, ";
3677 _data->code += dst->scalar(0, i).str;
3678 _data->code += ", ";
3679 _data->code += x_s->scalar(0, i).str;
3680 _data->code += " >= 0);\n";
            // mi_0 = select(-1, mi_0, y_s >= 0);
3682 _data->code += dst->scalar(0, i).str;
3683 _data->code += " = select(-1, ";
3684 _data->code += dst->scalar(0, i).str;
3685 _data->code += ", ";
3686 _data->code += y_s->scalar(0, i).str;
3687 _data->code += " >= 0);\n";
            // mi_0 = select(-1, mi_0, x_s < width);
3689 _data->code += dst->scalar(0, i).str;
3690 _data->code += " = select(-1, ";
3691 _data->code += dst->scalar(0, i).str;
3692 _data->code += ", ";
3693 _data->code += x_s->scalar(0, i).str;
3694 _data->code += " < ";
3695 _data->code += width + ");\n";
            // mi_0 = select(-1, mi_0, y_s < height);
3697 _data->code += dst->scalar(0, i).str;
3698 _data->code += " = select(-1, ";
3699 _data->code += dst->scalar(0, i).str;
3700 _data->code += ", ";
3701 _data->code += y_s->scalar(0, i).str;
3702 _data->code += " < ";
3703 _data->code += height + ");\n";
3704 }
3705 compound_statement_end();
3706 }
3707
3708private:
3709 GpuKernelWriterDataHolder* _data { nullptr };
3710 GpuKernelWriterAttribute * _attr { nullptr };
3711};
3712
3713/** IGpuKernelWriter factory class */
3714class GpuKernelWriterFactory final
3715{
3716public:
    /** Static method to create the IGpuKernelWriter according to the Gpu programming language
    *
    * @param[in] attr Kernel writer attributes
    * @param[in] x    Kernel writer data holder, which also carries the Gpu programming language to use
    *
    * @return IGpuKernelWriter
3722 */
3723 static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
3724 {
3725 switch(x->programming_language())
3726 {
3727 case GpuTargetLanguage::OpenCL:
3728 return std::make_unique<ClKernelWriter>(attr, x);
3729 default:
3730 std::cout << "Unsupported Gpu programming language" << std::endl;
3731 assert(false);
Viet-Hoa Doe1880f02023-06-28 10:25:35 +01003732 return nullptr;
Viet-Hoa Dobd4f6b92023-05-30 09:34:32 +01003733 }
3734 }
3735};
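// End-to-end usage sketch (illustrative only; the tensor info, tile and kernel names are hypothetical and
// error handling is omitted):
//
//   GpuKernelWriterAttribute  attr{};
//   GpuKernelWriterDataHolder data(GpuTargetLanguage::OpenCL);
//   auto writer = GpuKernelWriterFactory::create(&attr, &data);
//
//   TensorInfo src_info{}; // shape, data type and id would be filled in here
//   writer->set_IdSpace(1);
//   writer->declare_argument("src", src_info);
//   writer->declare_tile("gid_x", TileInfo(DataType::Int32));
//   writer->op_get_global_id(Operand("gid_x"), 0);
//
//   const std::string code = generate_code(data, "example_kernel");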
3736
3737inline int32_t adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx)
3738{
3739 auto tensor = tensor_info_id->shape;
3740
3741 int32_t dim[3] = {0};
3742
3743 switch(tensor_format)
3744 {
3745 case TensorSamplerFormat::C_W_H:
3746 dim[0] = tensor[0];
3747 dim[1] = tensor[1];
3748 dim[2] = tensor[2];
3749 break;
3750 case TensorSamplerFormat::C_WH_1:
3751 dim[0] = tensor[0];
3752 dim[1] = tensor[1] * tensor[2];
3753 dim[2] = 1;
3754 break;
3755 default:
3756 std::cout << "Unsupported tensor format" << std::endl;
3757 assert(false);
3758 break;
3759 }
3760
3761 return std::min(step, dim[idx]);
3762}
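// Worked example (hypothetical shape {C = 3, W = 7, H = 5, N = 1} held in "info"):
//   adjust_step(TensorSamplerFormat::C_W_H,  4, &info, 0) -> min(4, 3)     = 3
//   adjust_step(TensorSamplerFormat::C_W_H,  4, &info, 1) -> min(4, 7)     = 4
//   adjust_step(TensorSamplerFormat::C_WH_1, 8, &info, 1) -> min(8, 7 * 5) = 8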
3763
3764} // namespace prototype
3765} // namespace ckw
3766
3767#endif // CKW_SRC_PROTOTYPE_H