1/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#ifndef CKW_SRC_PROTOTYPE_H
26#define CKW_SRC_PROTOTYPE_H
27
28#include <vector>
29#include <map>
30#include <string>
31#include <cstdint> // int32_t
32#include <iostream> // cout (to be removed)
33#include <cassert> // assert (to be removed)
34#include <unordered_map>
35#include <chrono>
36#include <cmath>
37#include <memory>
38#include <algorithm>
39#include <array>
40#include <stdexcept>
41
42#include "ckw/Types.h"
43#include "ckw/TensorInfo.h"
44#include "ckw/Error.h"
45
46namespace ckw
47{
48namespace prototype {
49
50// Dummy data structure for Size2D
51using Size2D = std::vector<int32_t>;
52
53// Dummy Status
54using Status = void;
55
56enum class ComponentType : int32_t
57{
58 Complex = 0,
59 Simple = 1,
60 Unfusable = 2
61};
62
63enum class GpuCompilationSpeed
64{
65 Fast = 0x00, // fast compilation may increase the latency of the network
66 Slow = 0x01 // slow compilation may decrease the latency of the network
67};
68
69enum class GpuExtensions
70{
71 Fp16,
72 Dot8,
73 Mmul,
74 FastMath
75};
76
77struct TensorInfo
78{
79 TensorShape shape { {0} };
80 DataType data_type { DataType::Unknown };
81 TensorDataLayout data_layout { TensorDataLayout::Nhwc };
82 int32_t id { -1 };
83};
84
85struct ComponentAttribute
86{
87 GpuCompilationSpeed compilation_speed {GpuCompilationSpeed::Fast};
88 bool overwrite_tile { true };
89};
90
91inline std::string data_type_to_cl_type(DataType dt)
92{
93 switch(dt)
94 {
95 case DataType::Fp32:
96 return "float";
97 case DataType::Fp16:
98 return "half";
99 case DataType::Int8:
100 return "char";
101 case DataType::Uint8:
102 return "uchar";
103 case DataType::Uint16:
104 return "ushort";
105 case DataType::Int16:
106 return "short";
107 case DataType::Uint32:
108 return "uint";
109 case DataType::Int32:
110 return "int";
111 case DataType::Bool:
112 return "bool";
113 default:
114 assert(false);
115 }
116}
117
118inline int32_t width_to_cl_vector_size(int32_t width)
119{
120 switch(width)
121 {
122 case 1:
123 return 1;
124 case 2:
125 return 2;
126 case 3:
127 return 3;
128 case 4:
129 return 4;
130 case 5:
131 case 6:
132 case 7:
133 case 8:
134 return 8;
135 case 9:
136 case 10:
137 case 11:
138 case 12:
139 case 13:
140 case 14:
141 case 15:
142 case 16:
143 return 16;
144 default:
145 assert(false);
146 }
147}
148
149inline std::string get_cl_data_type(DataType dt, int32_t width)
150{
151 std::string data_type;
152 int32_t w = width_to_cl_vector_size(width);
153 data_type += data_type_to_cl_type(dt);
154 if(w != 1)
155 {
156 data_type += std::to_string(w);
157 }
158 return data_type;
159}
160
161inline std::string to_opencl_store(int32_t vector_length)
162{
163 if(vector_length != 1)
164 {
165 return "vstore" + std::to_string(vector_length) + "(";
166 }
167 else
168 {
169 return "*(";
170 }
171}
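// Illustrative input/output pairs for the helpers above (an informal sketch; the values follow
// directly from the switch statements and are not an exhaustive list):
//   get_cl_data_type(DataType::Fp16, 3)  -> "half3"
//   get_cl_data_type(DataType::Fp32, 5)  -> "float8"   (5 is rounded up to the next valid CL vector width)
//   get_cl_data_type(DataType::Int32, 1) -> "int"      (no width suffix for scalars)
//   to_opencl_store(4)                   -> "vstore4("
//   to_opencl_store(1)                   -> "*("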
172
173struct TileInfo
174{
175 TileInfo() {}
176 TileInfo(DataType dt) : dt(dt), w(1), h(1) {}
177 TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) {}
178 TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) {}
179 DataType dt{ DataType::Unknown }; // Data type of the tile
180 int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels)
181 int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions)
182};
183
184inline std::ostream& operator << (std::ostream& o, const TileInfo& a)
185{
186 o << a.w << " x " << a.h;
187 return o;
188}
189
190struct DataTypeAsString
191{
192 std::string str { "" };
193 DataType dt { DataType::Unknown };
194 int32_t size { 1 };
195};
196
197struct ValueAsString
198{
199 std::string str { "" };
200 DataTypeAsString type { };
201};
202
203// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
204// A Tile is a collection of variables used to express 2D data.
205class IScalarTile
206{
207public:
208 virtual ~IScalarTile() = default;
209 /** Method to get the scalar variable from a tile
210 * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
211 * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
212 *
213 * @return the scalar variable as a string
214 */
215 virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;
216 /** Method to get the list of underlying variable names used by the tile
217 *
218 * @return the list of variable names
219 */
220 virtual std::vector<ValueAsString> underlying_source_variables() const = 0;
221 /** Method to get the name of the tile.
222 *
223 * @return the name of the tile
224 */
225 std::string name() const
226 {
227 return _basename;
228 }
229 /** Method to get the tile format
230 *
231 * @return the format
232 */
233 TileInfo format() const
234 {
235 return _format;
236 }
237 /** Method to know whether the tile is assignable or not (constant)
238 *
239 * @return true if the tile is assignable
240 */
241 virtual bool is_assignable() const = 0;
242 /** Method to know whether the tile needs to be declared
243 *
244 * @return true if the tile needs to be declared in the code before being used
245 */
246 virtual bool need_declaration() const = 0;
247protected:
248 TileInfo _format { }; // Tile format
249 std::string _basename { "" }; // Tile name
250};
251
252// A tile is a collection of variables used to express 2D data. The variables are vectors in the GPU context.
253// The vector size is given by the width of the tile, and the height of the tile gives the number of vectors.
254class IVectorTile : public IScalarTile
255{
256public:
257 virtual ~IVectorTile() = default;
258 /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
259 * The user can query the list of supported widths for the vectors through preferred_vector_sizes().
260 *
261 * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
262 *
263 * @return the vector variable as a string
264 */
265 virtual ValueAsString vector(int32_t y) const = 0;
266 /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
267 *
268 * @return the vector variable as a string
269 */
270 virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
271 /** Method to get the preferred vector sizes.
272 *
273 * @return a vector with the preferred vector sizes
274 */
275 //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
276};
277
278class ClTile : public IVectorTile
279{
280public:
281 ClTile(const std::string& name, TileInfo format)
282 {
283 _format = format;
284 _basename = name;
285 }
286
287 ValueAsString scalar(int32_t x, int32_t y) const override
288 {
289 x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
290 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
291
292 ValueAsString t;
293 t.str = build_variable_name(y);
294 t.type.str = get_cl_data_type(_format.dt, 1);
295 t.type.dt = _format.dt;
296 t.type.size = 1;
297
298 // Check required because if the width has only one element, we cannot use .s0
299 if(_format.w != 1)
300 {
301 // Automatic broadcasting
302            t.str += ".s" + to_scalar_hex(x);
303 }
304
305 return t;
306 }
307
308 ValueAsString vector(int32_t y) const override
309 {
310 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
311
312 ValueAsString t;
313 t.str = build_variable_name(y);
314 t.type.str = get_cl_data_type(_format.dt, _format.w);
315 t.type.dt = _format.dt;
316 t.type.size = _format.w;
317 return t;
318 }
319
320 ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
321 {
322 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
323
324 ValueAsString t;
325 t.str = build_variable_name(y);
326 t.type.str = get_cl_data_type(_format.dt, width);
327 t.type.dt = _format.dt;
328 t.type.size = width;
329
330 if(_format.w != 1)
331 {
332 t.str += ".s";
333 for(int i = 0; i < width; ++i)
334 {
335 t.str += to_scalar_hex(x_start + i);
336 }
337 }
338 return t;
339 }
340
341 std::vector<ValueAsString> underlying_source_variables() const override
342 {
343 std::vector<ValueAsString> vars;
344 for(int32_t y = 0; y < _format.h; ++y)
345 {
346 ValueAsString t;
347 t.str = build_variable_name(y);
348 t.type.str = get_cl_data_type(_format.dt, _format.w);
349 t.type.dt = _format.dt;
350 t.type.size = _format.w;
351 vars.push_back(t);
352 }
353 return vars;
354 }
355
356 bool is_assignable() const override
357 {
358 return true;
359 }
360
361 bool need_declaration() const override
362 {
363 return true;
364 }
365
366private:
367 std::string build_variable_name(int32_t y) const
368 {
369 std::string var_name = _basename;
370
371 if(_format.h == 1)
372 {
373 return var_name;
374
375 }
376 else
377 {
378 var_name += "_";
379 var_name += std::to_string(y);
380 }
381
382 return var_name;
383 }
384
385 std::string to_scalar_hex(int32_t x) const
386 {
387 switch(x)
388 {
389 case 0:
390 case 1:
391 case 2:
392 case 3:
393 case 4:
394 case 5:
395 case 6:
396 case 7:
397 case 8:
398 case 9:
399 return std::to_string(x);
400 case 10:
401 return "A";
402 case 11:
403 return "B";
404 case 12:
405 return "C";
406 case 13:
407 return "D";
408 case 14:
409 return "E";
410 case 15:
411 return "F";
412 default:
413 std::cout << "Unsupported hexadecimal value" << std::endl;
414 assert(false);
415 }
416 }
417};
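// A minimal sketch of the variable names produced by ClTile (illustrative only; the tile
// name and TileInfo are arbitrary):
//   ClTile tile("dst", TileInfo(DataType::Fp32, 4, 2));
//   tile.underlying_source_variables()  -> { "dst_0" (float4), "dst_1" (float4) }
//   tile.vector(1).str                  -> "dst_1"
//   tile.scalar(2, 1).str               -> "dst_1.s2"
//   tile.vector(0, 2, 1).str            -> "dst_1.s01"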
418
419// Unlike ClTile, this tile holds its values directly as strings. The name of this class is somewhat misleading, since the stored values can change over time.
420class ClConstantTile : public IVectorTile
421{
422public:
423 ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt)
424 {
425 _format.w = in[0].size();
426 _format.h = in.size();
427 _format.dt = dt;
428
429 _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));
430
431 for(int32_t y = 0; y < _format.h; ++y)
432 {
433 for(int32_t x = 0; x < _format.w; ++x)
434 {
435 _data[y][x] = in[y][x];
436 }
437 }
438 }
439
440 ValueAsString scalar(int32_t x, int32_t y) const override
441 {
442 x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
443 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
444
445 ValueAsString t;
446 t.str = _data[y][x];
447 t.type.str = get_cl_data_type(_format.dt, 1);
448 t.type.dt = _format.dt;
449 t.type.size = 1;
450
451 return t;
452 }
453
454 ValueAsString vector(int32_t y) const override
455 {
456 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
457
458 return vector(0, _format.w, y);
459 }
460
461 ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
462 {
463 y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
464
465 ValueAsString t;
466 t.str = "";
467 t.type.str = get_cl_data_type(_format.dt, width);
468 t.type.dt = _format.dt;
469 t.type.size = width;
470
471 if(width > 1)
472 {
473 t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
474 }
475
476 int32_t x = x_start;
477        for(; x < x_start + width - 1; ++x)
478 {
479 t.str += scalar(x, y).str;
480 t.str += ", ";
481 }
482 t.str += scalar(x, y).str;
483
484 if(width > 1)
485 {
486 t.str += "))";
487 }
488
489 return t;
490 }
491
492 std::vector<ValueAsString> underlying_source_variables() const override
493 {
494 std::vector<ValueAsString> vars;
495
496 for(int32_t y = 0; y < _format.h; ++y)
497 {
498 for(int32_t x = 0; x < _format.w; ++x)
499 {
500 ValueAsString t;
501 t.str = _data[y][x];
502 t.type.str = get_cl_data_type(_format.dt, 1);
503 t.type.dt = _format.dt;
504 t.type.size = 1;
505 vars.push_back(t);
506 }
507 }
508
509 return vars;
510 }
511
512 bool is_assignable() const override
513 {
514 return false;
515 }
516
517 bool need_declaration() const override
518 {
519 return false;
520 }
521
522private:
523 std::vector<std::vector<std::string>> _data{};
524};
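// A minimal sketch of what ClConstantTile produces (illustrative only):
//   ClConstantTile c({ { "1.0f", "2.0f" }, { "3.0f", "4.0f" } }, DataType::Fp32);
//   c.scalar(1, 0).str -> "2.0f"
//   c.vector(1).str    -> "((float2)(3.0f, 4.0f))"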
525
526enum class TensorComponentIndex : int32_t
527{
528 IndexMask = 0x0000000f,
529};
530
531enum class TensorComponentType : int32_t
532{
533 OffsetFirstElement = 0x00000100,
534 Stride = 0x00001000,
535 Dimension = 0x00010000,
536 FoldedDimension = 0x00100000,
537 Constant = 0x01000000
538};
539
540enum class TensorComponent : int32_t
541{
542 Unknown = 0x00000000,
543 OffsetFirstElement = 0x00000100,
544 Stride1 = 0x00001001,
545 Stride2 = 0x00001002,
546 Stride3 = 0x00001003,
547 Stride4 = 0x00001004,
548 Dim0 = 0x00010000,
549 Dim1 = 0x00010001,
550 Dim2 = 0x00010002,
551 Dim3 = 0x00010003,
552 Dim4 = 0x00010004,
553 C = 0x00010000, // Dim0
554 W = 0x00010001, // Dim1
555 H = 0x00010002, // Dim2
556 D = 0x00010003,
557 N = 0x00010004,
558 Dim1xDim2 = 0x00100021,
559 Dim1xDim2xDim3 = 0x00100321,
560 WxH = 0x00100021,
561 WxHxD = 0x00100321
562};
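// The enumerators above are bit-encoded: the upper bits identify the component type (see
// TensorComponentType) and the lowest nibble (TensorComponentIndex::IndexMask) carries the
// dimension index. For example:
//   Stride2   = 0x00001002 = TensorComponentType::Stride    | index 2
//   Dim3      = 0x00010003 = TensorComponentType::Dimension | index 3
//   Dim1xDim2 = 0x00100021 = TensorComponentType::FoldedDimension with dimensions 2 and 1 packed in the low bits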
563
564inline std::string to_string(TensorComponent x)
565{
566 switch(x)
567 {
568 case TensorComponent::Unknown:
569 return "Unknown";
570 case TensorComponent::OffsetFirstElement:
571 return "OffsetFirstElement";
572 case TensorComponent::Stride1:
573 return "Stride1";
574 case TensorComponent::Stride2:
575 return "Stride2";
576 case TensorComponent::Stride3:
577 return "Stride3";
578 case TensorComponent::Stride4:
579 return "Stride4";
580 case TensorComponent::Dim0:
581 return "Dim0";
582 case TensorComponent::Dim1:
583 return "Dim1";
584 case TensorComponent::Dim2:
585 return "Dim2";
586 case TensorComponent::Dim3:
587 return "Dim3";
588 case TensorComponent::Dim4:
589 return "Dim4";
590 case TensorComponent::Dim1xDim2:
591 return "Dim1xDim2";
592 case TensorComponent::Dim1xDim2xDim3:
593 return "Dim1xDim2xDim3";
594 default:
595 assert(false);
596 }
597}
598
599class ITensorArgument
600{
601public:
602 virtual ~ITensorArgument() = default;
603 /** Method to get the tensor component as a string
604 *
605 * @param[in] x tensor component to query
606 *
607 * @return the tensor component as a string
608 */
609 virtual std::string component(TensorComponent x) = 0;
610 /** Method to get the tensor component type declaration as a string
611 *
612 * @return the tensor component type declaration as a string
613 */
614 virtual std::string component_type_declaration() const = 0;
615 /** Method to get the tensor component data type
616 *
617 * @return the tensor component data type
618 */
619 virtual DataType component_data_type() const = 0;
620 /** Method to get the tensor component declarations
621 *
622 * @return a vector containing the tensor component declarations
623 */
624 virtual std::vector<TensorComponent> component_declarations() const = 0;
625 /** Method to get the name of the tensor argument.
626 *
627 * @return the name of the tensor argument
628 */
629 std::string name() const
630 {
631 return _basename;
632 }
633 /** Method to get the tensor format
634 *
635 * @return the format
636 */
637 TensorInfo format() const
638 {
639 return _format;
640 }
641
642protected:
643 TensorInfo _format { };
644 std::string _basename {};
645};
646
647enum class GpuTensorStorage : int32_t
648{
649 Unknown = 0x0000,
650 BufferUint8Ptr = 0x0012,
651 Image2dReadOnly = 0x0020,
652 Image2dWriteOnly = 0x0021,
653 Image3dReadOnly = 0x0030,
654 Image3dWriteOnly = 0x0031
655};
656
657class IGpuTensorArgument : public ITensorArgument
658{
659public:
660 virtual ~IGpuTensorArgument() = default;
661 /** Method to get the tensor storage, which is the underlying storage used to keep the data memory
662 *
663 * @param[in] x tensor storage to query
664 *
665 * @return the tensor storage as a string
666 */
667 virtual std::string storage(GpuTensorStorage x) = 0;
668 /** Method to get the tensor storage type declaration as a string
669 *
670 * @param[in] x tensor component to query
671 *
672 * @return the tensor storage type declaration as a string
673 */
674 virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;
675 /** Method to get the tensor storage declarations
676 *
677 * @return a vector containing the tensor storage declarations
678 */
679 virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
680};
681
682class ClTensorArgument : public IGpuTensorArgument
683{
684public:
685 ClTensorArgument(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
686 {
687 _basename = name;
688 _format = x;
689 _return_by_value_when_possible = return_by_value_when_possible;
690 }
691
692 // Methods to override
693 std::string component(TensorComponent x) override
694 {
695 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Constant)))
696 {
697 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
698 return std::to_string(idx - 1);
699 }
700
701 if(_return_by_value_when_possible)
702 {
703 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::Dimension)))
704 {
705 int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
706 return std::to_string(_format.shape[idx]);
707 }
708
709 if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentType::FoldedDimension)))
710 {
711 switch(x)
712 {
713 case TensorComponent::Dim1xDim2:
714 return std::to_string(_format.shape[1] * _format.shape[2]);
715 case TensorComponent::Dim1xDim2xDim3:
716                    return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[3]);
717 default:
718 std::cout << "Unsupported folded dimension" << std::endl;
719 assert(false);
720 }
721 }
722 }
723
724 if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
725 {
726 _components_required.push_back(x);
727 }
728
729 return build_component_name(x);
730 }
731
732 std::string component_type_declaration() const override
733 {
734 return "int";
735 };
736
737 DataType component_data_type() const override
738 {
739 return DataType::Int32;
740 }
741
742 std::string storage(GpuTensorStorage x) override
743 {
744 if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
745 {
746 _storage_required.push_back(x);
747 }
748
749 return build_storage_name(x);
750 }
751
752 std::string storage_type_declaration(GpuTensorStorage x) const override
753 {
754 switch(x)
755 {
756 case GpuTensorStorage::BufferUint8Ptr:
757 return "__global uchar*";
758 case GpuTensorStorage::Image2dReadOnly:
759 return "__read_only image2d_t";
760 case GpuTensorStorage::Image2dWriteOnly:
761 return "__write_only image2d_t";
762 case GpuTensorStorage::Image3dReadOnly:
763 return "__read_only image3d_t ";
764 case GpuTensorStorage::Image3dWriteOnly:
765 return "__write_only image3d_t ";
766 default:
767 std::cout << "Unsupported storage" << std::endl;
768 assert(false);
769 }
770 };
771
772 std::vector<GpuTensorStorage> storage_declarations() const override
773 {
774 return _storage_required;
775 }
776
777 std::vector<TensorComponent> component_declarations() const override
778 {
779 return _components_required;
780 }
781
782private:
783 std::string build_storage_name(GpuTensorStorage x) const
784 {
785 std::string var_name = _basename;
786
787 switch(x)
788 {
789 case GpuTensorStorage::BufferUint8Ptr:
790 return var_name + "_ptr";
791 case GpuTensorStorage::Image2dReadOnly:
792 case GpuTensorStorage::Image2dWriteOnly:
793 return var_name + "_img2d";
794 case GpuTensorStorage::Image3dReadOnly:
795 case GpuTensorStorage::Image3dWriteOnly:
796 return var_name + "_img3d";
797 default:
798 std::cout << "Unsupported storage" << std::endl;
799 assert(false);
800 }
801
802 return var_name;
803 }
804
805 std::string build_component_name(TensorComponent x) const
806 {
807 std::string var_name = _basename;
808
809 switch(x)
810 {
811 case TensorComponent::OffsetFirstElement:
812 return var_name + "_offset_first_element";
813 case TensorComponent::Stride1:
814 return var_name + "_stride1";
815 case TensorComponent::Stride2:
816 return var_name + "_stride2";
817 case TensorComponent::Stride3:
818 return var_name + "_stride3";
819 case TensorComponent::Dim0:
820 return var_name + "_dim0";
821 case TensorComponent::Dim1:
822 return var_name + "_dim1";
823 case TensorComponent::Dim2:
824 return var_name + "_dim2";
825 case TensorComponent::Dim3:
826 return var_name + "_dim3";
827 case TensorComponent::Dim1xDim2:
828 return var_name + "_dim1xdim2";
829 case TensorComponent::Dim1xDim2xDim3:
830 return var_name + "_dim1xdim2xdim3";
831 default:
832 std::cout << "Unsupported component" << std::endl;
833 assert(false);
834 }
835
836 return var_name;
837 }
838
839 bool _return_by_value_when_possible { false };
840 std::vector<GpuTensorStorage> _storage_required {};
841 std::vector<TensorComponent> _components_required {};
842};
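// A minimal sketch of the naming convention implemented by build_storage_name() and
// build_component_name() above, for a ClTensorArgument created with the name "src"
// (the TensorInfo construction is omitted here; it depends on ckw/TensorInfo.h):
//   arg.storage(GpuTensorStorage::BufferUint8Ptr) -> "src_ptr"      (also recorded in storage_declarations())
//   arg.component(TensorComponent::Stride1)       -> "src_stride1"  (also recorded in component_declarations())
//   arg.component(TensorComponent::Dim1xDim2)     -> "src_dim1xdim2", or the folded value
//                                                    shape[1] * shape[2] when return_by_value_when_possible is true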
843
844/**
845 * @brief Data structure that contains the tiles declared by the components.
846 * The registry is a linear data structure that works like a stack. The user can call the @p increment_registry_level() method to
847 * increase the level of the stack (0 when it starts). When the user calls the @p decrement_registry_level() method, the registry decreases the level of the stack
848 * and removes (pops) all the tiles declared at the level above.
849 * When a tile is declared on level 0, it is a global tile. A global tile is visible in all parts of the code.
850 * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept: an @p id used to prevent name collisions
851 * when declaring tiles among different components.
852 *
853 */
854class GpuTileRegistry
855{
856public:
857enum class RegistryTileType
858{
859 Tile,
860 Link
861};
862
863using RegistryIdSpace = int32_t;
864using RegistryLevel = int32_t;
865using RegistryTileName = std::string;
866
867struct RegistryTileTableEntry
868{
869 RegistryLevel registry_level { 0 };
870 std::unique_ptr<IVectorTile> tile_object { nullptr };
871};
872
873struct RegistryTileTypeTableEntry
874{
875 RegistryTileType tile_type { RegistryTileType::Tile };
876 RegistryTileName tile_name {};
877 RegistryIdSpace registry_idspace { 0 };
878 RegistryLevel registry_level { 0 };
879};
880
881using RegistryTileTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
882using RegistryTileTypeTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>;
883 /**
884 * @brief Construct a new Gpu Tile Registry object
885 *
886 */
887 GpuTileRegistry()
888 {
889 _language = GpuTargetLanguage::Unknown;
890 }
891 /**
892 * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language
893 *
894 * @param[in] language Gpu programming language to use
895 */
896 GpuTileRegistry(GpuTargetLanguage language)
897 {
898 _language = language;
899 }
900 /**
901 * @brief Default destructor. Destroy the Gpu Tile Registry object
902 *
903 */
904 ~GpuTileRegistry() = default;
905 /**
906 * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles.
907 * Therefore, the IdSpace should be set before declaring any tiles.
908 *
909 * @param[in] id The IdSpace id
910 */
911 void set_IdSpace(int32_t id)
912 {
913 _IdSpace = id;
914 }
915 /**
916 * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles
917 *
918 * @return The IdSpace id
919 */
920 int32_t IdSpace() const
921 {
922 return _IdSpace;
923 }
924 /**
925 * @brief Gets all the IdSpace declarations defined in the tile registry.
926 *
927 * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
928 */
929 std::vector<int32_t> IdSpace_declarations() const
930 {
931 std::vector<int32_t> x;
932
933 auto it = _frags.begin();
934
935 while (it != _frags.end())
936 {
937 x.push_back(it->first);
938
939 it++;
940 }
941
942 return x;
943 }
944 /**
945 * @brief Declare a tile from a previously created tile
946 */
947 void insert(const std::string& name, const IVectorTile *frag)
948 {
949 assert(_language == GpuTargetLanguage::OpenCL);
950 const int32_t key_IdSpace = _IdSpace;
951 const std::string key_var_name = name;
952 const std::string var_name = frag->name();
953 TileInfo format = frag->format();
954
955 // First check whether a tile with the same name exists
956 IVectorTile *result = (*this)[key_var_name];
957 assert(result == nullptr);
958 if(result == nullptr)
959 {
960 std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
961
962 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
963 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
964
965 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link;
966 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
967 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
968 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
969 }
970 }
971 /**
972 * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace()
973 *
974 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
975 *
976 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
977 * @param[in] format Tile format to use
978 */
979 void insert(const std::string& name, const TileInfo& format)
980 {
981 assert(_language == GpuTargetLanguage::OpenCL);
982 const int32_t key_IdSpace = _IdSpace;
983 const std::string key_var_name = name;
984 const std::string var_name = generate_tile_name(name);
985
986 // First check whether a tile with the same name exists
987 IVectorTile *result = (*this)[key_var_name];
988 assert(result == nullptr);
989 if(result == nullptr)
990 {
991 std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
992 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
993 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
994
995 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
996 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
997 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
998 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
999 }
1000 }
1001 /**
1002 * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
1003 *
1004 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
1005 *
1006 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
1007 * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are deduced from the 2D std::vector
1008 * @param[in] dt The data type of the elements stored as std::string in the 2D std::vector. It is the user's responsibility to ensure
1009 * that the data type matches the content of the strings.
1010 */
1011 void insert(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt)
1012 {
1013 assert(_language == GpuTargetLanguage::OpenCL);
1014 const int32_t key_IdSpace = _IdSpace;
1015 const std::string key_var_name = name;
1016
1017 // First check whether a tile with the same name exists
1018 IVectorTile *result = (*this)[key_var_name];
1019 assert(result == nullptr);
1020 if(result == nullptr)
1021 {
1022 std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
1023 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
1024 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
1025
1026 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1027 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1028 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1029 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1030 }
1031 }
1032 /**
1033 * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
1034 *
1035 * @note This method can be used to declare temporary tiles that need to be accessed only once.
1036 *
1037 * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are deduced from the 2D std::vector
1038 * @param[in] dt The data type of the elements stored as std::string in the 2D std::vector. It is the user's responsibility to ensure
1039 * that the data type matches the content of the strings.
1040 *
1041 * @return IVectorTile* the anonymous constant tile
1042 */
1043 IVectorTile* insert(const std::vector<std::vector<std::string>>& in, DataType dt)
1044 {
1045 assert(_language == GpuTargetLanguage::OpenCL);
1046 const int32_t key_IdSpace = _IdSpace;
1047 const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++);
1048
1049 // First check whether a tile with the same name exists
1050 IVectorTile *result = (*this)[key_var_name];
1051 assert(result == nullptr);
1052 if(result == nullptr)
1053 {
1054 std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
1055 _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
1056 _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
1057
1058 _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
1059 _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
1060 _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
1061 _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
1062 }
1063
1064 return (*this)[key_var_name];
1065 }
1066 /**
1067 * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user
1068 *
1069 * @param[in] name The name of the tile to retrieve
1070 * @param[in] IdSpace The IdSpace id where to search the tile
1071 *
1072 * @return IVectorTile* The tile
1073 */
1074 IVectorTile* get(const std::string& name, int32_t IdSpace)
1075 {
1076 const int32_t key_IdSpace = IdSpace;
1077 const std::string key_var_name = name;
1078
1079 IVectorTile* result = nullptr;
1080 auto search_IdSpace = _frags.find(key_IdSpace);
1081 if(search_IdSpace != _frags.end())
1082 {
1083 auto search_tile = _frags[key_IdSpace].find(key_var_name);
1084 if(search_tile != _frags[key_IdSpace].end())
1085 {
1086 result = search_tile->second.tile_object.get();
1087 assert(result != nullptr);
1088 }
1089 }
1090
1091 return result;
1092 }
1093 /**
1094 * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace()
1095 *
1096 * @param[in] name The name of the tile to retrieve
1097 *
1098 * @return IVectorTile* The tile
1099 */
1100 IVectorTile* operator[](const std::string& name)
1101 {
1102 return get(name, _IdSpace);
1103 }
1104 /**
1105 * @brief Check whether a tile with the given name exists in the IdSpace provided by the user
1106 *
1107 * @param[in] name Name of the tile to search for
1108 * @param[in] IdSpace The IdSpace id where to search the tile
1109 *
1110 * @return true if the tile exists
1111 * @return false if the tile does not exist
1112 */
1113 bool has_tile(const std::string& name, int32_t IdSpace) const
1114 {
1115 const int32_t key_IdSpace = IdSpace;
1116 const std::string key_var_name = name;
1117
1118        // Check both that the IdSpace exists and that a tile with this name was declared in it
1119        auto search_IdSpace = _frags.find(key_IdSpace);
1120
1121        return search_IdSpace != _frags.end() && search_IdSpace->second.find(key_var_name) != search_IdSpace->second.end();
1122 }
1123 /**
1124 * @brief Check whether the tile within the current IdSpace exists
1125 *
1126 * @param[in] name Name of the tile to search for
1127 *
1128 * @return true if the tile exists
1129 * @return false if the tile does not exist
1130 */
1131 bool has_tile(const std::string& name) const
1132 {
1133 return has_tile(name, _IdSpace);
1134 }
1135 /**
1136 * @brief Get all the tiles declared within the IdSpace provided by the user
1137 *
1138 * @param[in] IdSpace IdSpace where to retrieve all the declared tiles
1139 *
1140 * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user
1141 */
1142 std::vector<IVectorTile*> tile_declarations(int32_t IdSpace)
1143 {
1144 std::vector<IVectorTile*> tiles;
1145
1146 std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();
1147
1148 while (it != _frag_types[IdSpace].end())
1149 {
1150            // The following condition should be enabled. However, we cannot enable it at this stage
1151            // because this method is currently used to retrieve the output tile produced by each component,
1152            // even though this method should NOT be used to retrieve the output tile
1153 //if(it->second.tile_type == RegistryTileType::Tile)
1154 {
1155 tiles.push_back(get(it->second.tile_name, it->second.registry_idspace));
1156 }
1157 it++;
1158 }
1159
1160 return tiles;
1161 }
1162 /**
1163 * @brief Increase the level of the stack.
1164 *
1165 */
1166 void increment_registry_level()
1167 {
1168 _registry_level++;
1169 }
1170 /**
1171 * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack.
1172 *
1173 */
1174 void decrement_registry_level()
1175 {
1176 assert(_registry_level >= 0);
1177
1178 // Remove all variables in the local scope
1179 std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();
1180
1181 while (it != _frags[_IdSpace].end())
1182 {
1183 if (it->second.registry_level == _registry_level)
1184 {
1185 it = _frags[_IdSpace].erase(it);
1186 }
1187 else
1188 {
1189 it++;
1190 }
1191 }
1192
1193 std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();
1194
1195 while (it_type != _frag_types[_IdSpace].end())
1196 {
1197 if (it_type->second.registry_level == _registry_level)
1198 {
1199 it_type = _frag_types[_IdSpace].erase(it_type);
1200 }
1201 else
1202 {
1203 it_type++;
1204 }
1205 }
1206
1207 _registry_level--;
1208 }
1209 /**
1210 * @brief Get the level of the stack
1211 *
1212 */
1213 int32_t level() const
1214 {
1215 return _registry_level;
1216 }
1217
1218private:
1219 // This method ensures that the key is unique among different components
1220 std::string generate_tile_name(const std::string& name)
1221 {
1222 assert(_IdSpace >= 0 );
1223 if(_registry_level == 0)
1224 {
1225 return "_G" + std::to_string(_IdSpace) + "_" + name;
1226 }
1227 else
1228 {
1229 return name;
1230 }
1231 }
1232 RegistryTileTable _frags {};
1233 RegistryTileTypeTable _frag_types {};
1234 RegistryLevel _registry_level { 0 };
1235 RegistryIdSpace _IdSpace { -1 };
1236 int32_t _anonymous_frag_count { 0 }; // Counter used to create the anonymous tiles
1237 GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language
1238};
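// A minimal usage sketch for the registry above (illustrative only; tile names and shapes are arbitrary):
//   GpuTileRegistry registry(GpuTargetLanguage::OpenCL);
//   registry.set_IdSpace(0);                                  // select the IdSpace before declaring tiles
//   registry.insert("dst", TileInfo(DataType::Fp32, 4, 4));   // level-0 (global) tile
//   registry.increment_registry_level();                      // enter a nested scope
//   registry.insert("tmp", TileInfo(DataType::Fp32, 4, 1));   // level-1 (local) tile
//   registry.decrement_registry_level();                      // pops "tmp"; "dst" remains visible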
1239
1240using TensorEntry = std::unique_ptr<IGpuTensorArgument>;
1241
1242/**
1243 * @brief Data structure that contains the tensors consumed by the components.
1244 * Since different components may use the same reference name for a tensor, the registry adopts the IdSpace concept: an @p id used to prevent name collisions
1245 * when declaring tensors among different components.
1246 *
1247 */
1248class GpuTensorArgumentRegistry
1249{
1250public:
1251 /**
1252 * @brief Construct a new Gpu Tensor Registry object
1253 *
1254 */
1255 GpuTensorArgumentRegistry()
1256 {
1257 _language = GpuTargetLanguage::Unknown;
1258 }
1259 /**
1260 * @brief Construct a new Gpu Tensor Registry object
1261 *
1262 * @param[in] language Gpu programming language to use
1263 */
1264 GpuTensorArgumentRegistry(GpuTargetLanguage language)
1265 {
1266 _language = language;
1267 }
1268 /**
1269 * @brief Default destructor. Destroy the Gpu Tensor Registry object
1270 *
1271 */
1272 ~GpuTensorArgumentRegistry() = default;
1273 /**
1274 * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors.
1275 * Therefore, the IdSpace should be set before declaring any tensors.
1276 *
1277 * @param[in] id The IdSpace id
1278 */
1279 void set_IdSpace(int32_t id)
1280 {
1281 _IdSpace = id;
1282 }
1283 /**
1284 * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors
1285 *
1286 * @return The IdSpace id
1287 */
1288 int32_t IdSpace() const
1289 {
1290 return _IdSpace;
1291 }
1292 /**
1293 * @brief Gets all the IdSpace declarations defined in the tensor registry.
1294 *
1295 * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
1296 */
1297 std::vector<int32_t> IdSpace_declarations() const
1298 {
1299 std::vector<int32_t> x;
1300
1301 auto it = _refs.begin();
1302
1303 while (it != _refs.end())
1304 {
1305 x.push_back(it->first);
1306
1307 it++;
1308 }
1309
1310 return x;
1311 }
1312 /**
1313 * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace()
1314 *
1315 * @note The reference name used for declaring the tensor should not be previously used in the IdSpace
1316 *
1317 * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry.
1318 * @param[in] x Tensor info; the tensor id is taken from its id field
1319 * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components
1320 */
1321 void insert(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible)
1322 {
1323 assert(_language == GpuTargetLanguage::OpenCL);
1324 const int32_t key_IdSpace = _IdSpace;
1325 const int32_t tensor_id = x.id;
1326 const std::string key_var_name = name;
1327 const std::string var_name = generate_tensor_name(name, tensor_id);
1328
1329 // First, check whether the tensor has already a reference. If so, trigger an assert
1330 assert(!has_tensor_argument(name));
1331
1332 // Check whether a tensor with that tensorID exists
1333 auto result = _tensor_arguments.find(tensor_id);
1334 if(result == _tensor_arguments.end())
1335 {
1336 // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference
1337 std::unique_ptr<ClTensorArgument> arg = std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
1338 _tensor_arguments[tensor_id] = std::move(arg);
1339 }
1340
1341 _refs[key_IdSpace][key_var_name] = tensor_id;
1342 }
1343 /**
1344 * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace()
1345 *
1346 * @param[in] name The name of the tensor to retrieve
1347 *
1348 * @return IGpuTensorArgument* The tensor
1349 */
1350 IGpuTensorArgument* operator[](const std::string& name)
1351 {
1352 const int32_t key_IdSpace = _IdSpace;
1353 const std::string key_var_name = name;
1354
1355 IGpuTensorArgument* result = nullptr;
1356 auto search_IdSpace = _refs.find(key_IdSpace);
1357 if(search_IdSpace != _refs.end())
1358 {
1359 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1360
1361 if(search_tensor_id != _refs[key_IdSpace].end())
1362 {
1363 const int32_t tensor_id = search_tensor_id->second;
1364 auto search_tensor_argument = _tensor_arguments.find(tensor_id);
1365 if(search_tensor_argument != _tensor_arguments.end())
1366 {
1367 result = search_tensor_argument->second.get();
1368 }
1369 assert(result != nullptr);
1370 }
1371 }
1372
1373 return result;
1374 }
1375 /**
1376 * @brief Get all the tensors declared in the IdSpace provided by the user
1377 *
1378 * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors
1379 */
1380 std::vector<IGpuTensorArgument*> tensor_argument_declarations()
1381 {
1382 std::vector<IGpuTensorArgument*> args;
1383
1384 auto it = _tensor_arguments.begin();
1385
1386 while (it != _tensor_arguments.end())
1387 {
1388 args.push_back(it->second.get());
1389 it++;
1390 }
1391
1392 return args;
1393 }
1394 /**
1395 * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists
1396 *
1397 * @param[in] name Name of the tensor argument to search for
1398 *
1399 * @return true if the tensor argument exists
1400 * @return false if the tensor argument does not exist
1401 */
1402 bool has_tensor_argument(const std::string& name)
1403 {
1404 const int32_t key_IdSpace = _IdSpace;
1405 const std::string key_var_name = name;
1406
1407 auto search_IdSpace = _refs.find(key_IdSpace);
1408
1409 if(search_IdSpace != _refs.end())
1410 {
1411 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1412
1413 return search_tensor_id != _refs[key_IdSpace].end();
1414 }
1415 else
1416 {
1417 return false;
1418 }
1419 }
1420 /**
1421 * @brief Check whether the tensor argument exists in the IdSpace provided by the user
1422 *
1423 * @param[in] name Name of the tensor argument to search for
1424 * @param[in] IdSpace The IdSpace id where to search the tensor argument
1425 *
1426 * @return true if the tensor argument exists
1427 * @return false if the tensor argument does not exist
1428 */
1429 bool has_tensor_argument(const std::string& name, int32_t IdSpace)
1430 {
1431 const int32_t key_IdSpace = IdSpace;
1432 const std::string key_var_name = name;
1433
1434 auto search_IdSpace = _refs.find(key_IdSpace);
1435
1436 if(search_IdSpace != _refs.end())
1437 {
1438 auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
1439
1440 return search_tensor_id != _refs[key_IdSpace].end();
1441 }
1442 else
1443 {
1444 return false;
1445 }
1446 }
1447private:
1448 // This method ensures that the key is unique among different components
1449 std::string generate_tensor_name(const std::string& name, int32_t tensor_id)
1450 {
1451 assert(tensor_id >= 0 );
1452
1453 return name + std::to_string(tensor_id);
1454 }
1455
1456 std::map<int32_t, TensorEntry> _tensor_arguments {};
1457 std::map<int32_t, std::map<std::string, int32_t>> _refs {};
1458 int32_t _IdSpace { -1 };
1459 GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language
1460};
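// A minimal usage sketch for the registry above (illustrative only; src_info is an assumed
// TensorInfo whose id field is set to 0):
//   GpuTensorArgumentRegistry tensors(GpuTargetLanguage::OpenCL);
//   tensors.set_IdSpace(0);
//   tensors.insert("src", src_info, /* return_by_value_when_possible */ false);
//   IGpuTensorArgument *src = tensors["src"];                            // resolved through the tensor id
//   std::string ptr = src->storage(GpuTensorStorage::BufferUint8Ptr);    // "src0_ptr" (name + tensor id + "_ptr")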
1461
1462enum class OpType : int32_t
1463{
1464 Elementwise = 0x0000,
1465 Relational = 0x1000,
1466 Algebra = 0x2000
1467};
1468
1469inline std::string to_string(AssignmentOp op)
1470{
1471 switch(op)
1472 {
1473 case AssignmentOp::Decrement:
1474 return "-=";
1475 case AssignmentOp::Increment:
1476 return "+=";
1477
1478 default:
1479 assert(false);
1480 }
1481}
1482
1483inline std::string to_string(BinaryOp op)
1484{
1485 switch(op)
1486 {
1487 case BinaryOp::Add:
1488 return "+";
1489 case BinaryOp::Sub:
1490 return "-";
1491 case BinaryOp::Mul:
1492 return "*";
1493 case BinaryOp::Div:
1494 return "/";
1495 case BinaryOp::Mod:
1496 return "%";
1497 case BinaryOp::Equal:
1498 return "==";
1499 case BinaryOp::Less:
1500 return "<";
1501 case BinaryOp::LessEqual:
1502 return "<=";
1503 case BinaryOp::Greater:
1504 return ">";
1505 case BinaryOp::GreaterEqual:
1506 return ">=";
1507 case BinaryOp::LogicalAnd:
1508 return "&&";
1509 case BinaryOp::LogicalOr:
1510 return "||";
1511 case BinaryOp::LogicalNot:
1512 return "!";
1513 default:
1514 assert(false);
1515 }
1516}
1517
1518inline std::string binary_op_string(BinaryOp op)
1519{
1520 switch(op)
1521 {
1522 case BinaryOp::Add:
1523 return "add";
1524 case BinaryOp::Sub:
1525 return "sub";
1526 case BinaryOp::Mul:
1527 return "mul";
1528 case BinaryOp::Div:
1529 return "div";
1530 case BinaryOp::Mod:
1531 return "mod";
1532 case BinaryOp::Equal:
1533 return "eq";
1534 case BinaryOp::Less:
1535            return "lt";
1536        case BinaryOp::LessEqual:
1537            return "lte";
1538        case BinaryOp::Greater:
1539            return "gt";
1540        case BinaryOp::GreaterEqual:
1541            return "gte";
1542 default:
1543 assert(false);
1544 }
1545}
1546
1547enum class OperandType : int32_t
1548{
1549 Unknown = 0x00000000,
1550 ScalarFp32 = 0x00001011, // Immediate scalar tile
1551 ScalarFp16 = 0x00001012, // Immediate scalar tile
1552 ScalarInt32 = 0x00001021, // Immediate scalar tile
1553 ScalarInt16 = 0x00001022, // Immediate scalar tile
1554 ScalarInt8 = 0x00001024, // Immediate scalar tile
1555 ScalarUInt32 = 0x00001031, // Immediate scalar tile
1556 ScalarUInt16 = 0x00001032, // Immediate scalar tile
1557 ScalarUInt8 = 0x00001034, // Immediate scalar tile
1558 ScalarBool = 0x00001041, // Immediate scalar tile
1559 ScalarTile = 0x00001050, // Scalar from a tile
1560 Tile = 0x00010000, // Tile
1561 TensorStride1 = 0x00100001, // Tensor component
1562 TensorStride2 = 0x00100002, // Tensor component
1563 TensorStride3 = 0x00100003, // Tensor component
1564 TensorStride4 = 0x00100004, // Tensor component
1565 TensorDim0 = 0x00100010, // Tensor component
1566 TensorDim1 = 0x00100020, // Tensor component
1567 TensorDim2 = 0x00100030, // Tensor component
1568 TensorDim3 = 0x00100040, // Tensor component
1569 TensorDim4 = 0x00100050, // Tensor component
1570 TensorC = 0x00100010, // Tensor component
1571 TensorW = 0x00100020, // Tensor component
1572 TensorH = 0x00100030, // Tensor component
1573 TensorD = 0x00100040, // Tensor component
1574 TensorN = 0x00100050, // Tensor component
1575 TensorDim1xDim2 = 0x00100100, // Tensor component
1576 TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
1577 TensorWxH = 0x00100300, // Tensor component
1578 TensorWxHxD = 0x00100400, // Tensor component
1579 TensorDataOffset = 0x00100500, // Tensor component
1580};
1581
1582struct ScalarTileCoord
1583{
1584 ScalarTileCoord() {}
1585 ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) {}
1586 int32_t x { -1 };
1587 int32_t y { -1 };
1588};
1589/**
1590 * @brief Operand class. This object is used to pass the operands to the operations performed by the writer.
1591 * Operand can be of three types:
1592 * -# Scalar immediate: constant expression
1593 * -# Tile: A tile
1594 * -# Tensor component: A component (scalar) of a tensor
1595 *
1596 */
1597class Operand
1598{
1599public:
1600 Operand(const std::string &val)
1601 {
1602 _str = val;
1603 _type = OperandType::Tile;
1604 }
1605
1606 Operand(const std::string &val, const ScalarTileCoord& coord)
1607 {
1608 _str = val;
1609 _type = OperandType::ScalarTile;
1610 _coord = coord;
1611 }
1612
1613 Operand(const std::string &val, OperandType type)
1614 {
1615 _str = val;
1616 _type = type;
1617 }
1618
1619 Operand(const Operand& t)
1620 {
1621 _str = t.value();
1622        _type  = t.type();
        _coord = t.scalar_tile_coordinate();
1623    }
1624
1625 Operand& operator=(const Operand& t)
1626 {
1627 _str = t.value();
1628 _type = t.type();
1629 _coord = t.scalar_tile_coordinate();
1630 return *this;
1631 }
1632
1633 std::string value() const
1634 {
1635 return _str;
1636 }
1637
1638 OperandType type() const
1639 {
1640 return _type;
1641 }
1642
1643 ScalarTileCoord scalar_tile_coordinate() const
1644 {
1645 return _coord;
1646 }
1647
1648private:
1649 std::string _str {};
1650 OperandType _type { OperandType::Unknown };
1651 ScalarTileCoord _coord {};
1652};
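// A minimal sketch of how the different kinds of operand are constructed (names are illustrative):
//   Operand imm("1.0f", OperandType::ScalarFp32);   // scalar immediate (constant expression)
//   Operand tile("acc");                            // tile previously declared in the tile registry
//   Operand elem("lhs", ScalarTileCoord(0, 1));     // single scalar (x = 0, y = 1) of tile "lhs"
//   Operand dim("src", OperandType::TensorDim0);    // tensor component of tensor "src"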
1653
1654enum class GpuSamplerTensorStorage : int32_t
1655{
1656 Unknown = static_cast<int32_t>(GpuTensorStorage::Unknown),
1657 BufferUint8Ptr = static_cast<int32_t>(GpuTensorStorage::BufferUint8Ptr),
1658 Image2dReadOnly = static_cast<int32_t>(GpuTensorStorage::Image2dReadOnly),
1659 Image2dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image2dWriteOnly),
1660 Image3dReadOnly = static_cast<int32_t>(GpuTensorStorage::Image3dReadOnly),
1661    Image3dWriteOnly = static_cast<int32_t>(GpuTensorStorage::Image3dWriteOnly),
1662};
1663
1664struct GpuSampler
1665{
1666 GpuSampler() = default;
1667 TensorSamplerFormat format { TensorSamplerFormat::Unknown };
1668 GpuSamplerTensorStorage storage { GpuSamplerTensorStorage::Unknown };
1669 TensorSamplerAddressModeX address_mode_x { TensorSamplerAddressModeX::Unknown };
1670 TensorSamplerAddressModeY address_mode_y { TensorSamplerAddressModeY::Unknown };
1671 TensorSamplerAddressModeZ address_mode_z { TensorSamplerAddressModeZ::Unknown };
1672};
1673
1674inline GpuSampler create_simple_sampler(const TensorInfo* tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
1675{
1676 auto tensor = tensor_info_id->shape;
1677
1678 GpuSampler dst_sampler;
1679 dst_sampler.format = sampler.format;
1680 dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr;
1681 dst_sampler.address_mode_x = sampler.address_mode_x;
1682 dst_sampler.address_mode_y = sampler.address_mode_y;
1683 dst_sampler.address_mode_z = sampler.address_mode_z;
1684
1685 int32_t dim_x = 0;
1686 int32_t dim_y = 0;
1687 int32_t dim_z = 0;
1688
1689 switch(sampler.format)
1690 {
1691 case TensorSamplerFormat::C_W_H:
1692 dim_x = tensor[0];
1693 dim_y = tensor[1];
1694 dim_z = tensor[2];
1695 break;
1696 case TensorSamplerFormat::C_WH_1:
1697 dim_x = tensor[0];
1698 dim_y = tensor[1] * tensor[2];
1699 dim_z = 1;
1700 break;
1701 default:
1702 std::cout << "Unsupported tensor format" << std::endl;
1703 assert(false);
1704 break;
1705 }
1706
1707 if(dim_x == 1)
1708 {
1709 assert(step_x == 1);
1710 dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
1711 }
1712
1713 if(dim_y == 1)
1714 {
1715 assert(step_y == 1);
1716 dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
1717 }
1718
1719 if(dim_z == 1)
1720 {
1721 assert(step_z == 1);
1722 dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
1723 }
1724
1725 return dst_sampler;
1726}
1727
1728class GpuOutputSampler
1729{
1730public:
1731 GpuOutputSampler() = default;
1732 /**
1733     * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once
1734     * by the root component. Once initialized, all simple components will need to use this sampler
1735     * or a broadcasted version of it
1736     *
1737     * @param[in] tensor_info_id Tensor info of the output tensor
     * @param[in] tensor_storage Tensor storage to use for the output tensor
     * @param[in] tensor_format Tensor sampler format to use for the output tensor
1738     * @param[in] step_x Increment step in the X direction. It is not necessarily the same as the n0 of the tile!
1739     * @param[in] step_y Increment step in the Y direction. It is not necessarily the same as the m0 of the tile!
1740     * @param[in] step_z Increment step in the Z direction. It is not necessarily the same as the d0 of the tile!
1741 */
1742 void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z)
1743 {
1744 assert(_is_initialized == false);
1745
1746 _step_x = step_x;
1747 _step_y = step_y;
1748 _step_z = step_z;
1749 _tensor_info_id = tensor_info_id;
1750 _sampler = create_sampler(tensor_storage, tensor_format);
1751 _is_initialized = true;
1752 };
1753
1754 GpuSampler sampler() const
1755 {
1756 return _sampler;
1757 };
1758
1759 int32_t step_x() const
1760 {
1761 return _step_x;
1762 };
1763
1764 int32_t step_y() const
1765 {
1766 return _step_y;
1767 };
1768
1769 int32_t step_z() const
1770 {
1771 return _step_z;
1772 };
1773private:
1774 GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format)
1775 {
1776 // Output can only be in output mode
1777 assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly);
1778 assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly);
1779
1780 auto tensor = _tensor_info_id->shape;
1781
1782 GpuSampler sampler;
1783 sampler.format = tensor_format;
1784 sampler.storage = tensor_storage;
1785 sampler.address_mode_x = TensorSamplerAddressModeX::None;
1786 sampler.address_mode_y = TensorSamplerAddressModeY::None;
1787 sampler.address_mode_z = TensorSamplerAddressModeZ::None;
1788
1789 // In the case of texture, we do not need any special checks at the border
1790 if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
1791 {
1792 int32_t dim_x = 0;
1793 int32_t dim_y = 0;
1794 int32_t dim_z = 0;
1795
1796 switch(tensor_format)
1797 {
1798 case TensorSamplerFormat::C_W_H:
1799 dim_x = tensor[0];
1800 dim_y = tensor[1];
1801 dim_z = tensor[2];
1802 break;
1803 case TensorSamplerFormat::C_WH_1:
1804 dim_x = tensor[0];
1805 dim_y = tensor[1] * tensor[2];
1806 dim_z = 1;
1807 break;
1808 default:
1809 std::cout << "Unsupported tensor format" << std::endl;
1810 assert(false);
1811 break;
1812 }
1813
1814 if((dim_x % _step_x) != 0 && dim_x != 1)
1815 {
1816 sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
1817 }
1818
1819 if((dim_y % _step_y) != 0 && dim_y != 1)
1820 {
1821 sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
1822 }
1823
1824 if((dim_z % _step_z) != 0 && dim_z != 1)
1825 {
1826 sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
1827 }
1828 }
1829
1830 return sampler;
1831 }
1832 GpuSampler _sampler { }; // GpuSampler
1833 int32_t _step_x { 1 };
1834 int32_t _step_y { 1 };
1835 int32_t _step_z { 1 };
1836 const TensorInfo* _tensor_info_id { nullptr };
1837 bool _is_initialized { false };
1838};
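// A minimal usage sketch (illustrative only; dst_info is an assumed TensorInfo describing the output tensor):
//   GpuOutputSampler out_sampler;
//   out_sampler.initialize(&dst_info, GpuSamplerTensorStorage::BufferUint8Ptr,
//                          TensorSamplerFormat::C_W_H, /* step_x */ 4, /* step_y */ 1, /* step_z */ 1);
//   GpuSampler s = out_sampler.sampler();  // address modes are set based on the tensor shape and the steps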
1839
1840/**
1841 * @brief Tensor operand class. This object is used to pass tensor operands to the operations performed by the writer.
1842 */
1843class TensorOperand
1844{
1845public:
1846 TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
1847 {
1848 }
1849
1850 TensorOperand& operator=(const TensorOperand& t)
1851 {
1852 _str = t.value();
1853 _sampler = t.sampler();
1854 return *this;
1855 }
1856
1857 std::string value() const
1858 {
1859 return _str;
1860 }
1861
1862 GpuSampler sampler() const
1863 {
1864 return _sampler;
1865 }
1866
1867private:
1868 std::string _str {};
1869 GpuSampler _sampler {};
1870};
1871
1872/**
1873 * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer
1874 * This data structure must be initialized before being passed to the Gpu Kernel Writer
1875 *
1876 */
1877class GpuKernelWriterDataHolder
1878{
1879public:
1880 /**
1881 * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
1882 * the GPU target and target specific capabilities (extensions). For now, we just initialize the
1883 * programming language
1884 *
1885 * @param[in] language Gpu programming language to use
1886 */
1887 GpuKernelWriterDataHolder(GpuTargetLanguage language) : tiles(language), arguments(language), code(""), _language(language)
1888 {
1889 }
1890 /**
1891 * @brief Get the Gpu programming language used
1892 *
1893 * @return GpuTargetLanguage the Gpu programming language
1894 */
1895 GpuTargetLanguage programming_language() const
1896 {
1897 return _language;
1898 }
1899 /**
1900 * @brief @ref GpuTileRegistry
1901 *
1902 */
1903 GpuTileRegistry tiles{};
1904 /**
1905 * @brief @ref GpuTensorArgumentRegistry
1906 *
1907 */
1908 GpuTensorArgumentRegistry arguments{};
1909 /**
1910 * @brief @ref GpuOutputSampler.
1911 *
1912 */
1913 GpuOutputSampler output_sampler{};
1914 /**
1915 * @brief Source code
1916 *
1917 */
1918 std::string code{};
1919
1920 // GpuExtensionRegistry extensions{};
1921private:
1922 GpuTargetLanguage _language;
1923};
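// A minimal usage sketch (illustrative only):
//   GpuKernelWriterDataHolder data(GpuTargetLanguage::OpenCL);
//   data.tiles.set_IdSpace(0);
//   data.arguments.set_IdSpace(0);
//   // ... components declare tiles/tensor arguments and the writer appends the generated source to data.code ...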
1924
1925struct LWS
1926{
1927 int32_t x {1};
1928 int32_t y {1};
1929 int32_t z {1};
1930};
1931
1932/**
1933 * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker
1934 * declares an anonymous tile in the tile registry.
1935 */
1936class OperandUnpacker
1937{
1938public:
1939 OperandUnpacker(GpuTileRegistry& tiles, GpuTensorArgumentRegistry& arguments) : _tiles(tiles), _arguments(arguments)
1940 {
1941 // Increase the level of the stack to allocate possible temporary tiles
1942 _tiles.increment_registry_level();
1943 };
1944
1945 ~OperandUnpacker()
1946 {
1947 // Decrease the level of the stack to deallocate any temporary tiles
1948 _tiles.decrement_registry_level();
1949 }
1950
1951 IVectorTile* unpack(const Operand& src)
1952 {
1953 // Get the tile
1954 if(src.type() == OperandType::Tile)
1955 {
1956 assert(_tiles.has_tile(src.value()));
1957 return _tiles[src.value()];
1958 }
1959 // Create an anonymous tile with a constant
1960 else if(static_cast<int32_t>(src.type()) & 0x00001000)
1961 {
1962 if(src.type() == OperandType::ScalarTile)
1963 {
1964 ScalarTileCoord coord = src.scalar_tile_coordinate();
1965 assert(_tiles.has_tile(src.value()));
1966 assert(coord.x >= 0);
1967 assert(coord.y >= 0);
1968 auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
1969 return _tiles.insert({{{val.str}}}, val.type.dt);
1970 }
1971 else
1972 {
1973 return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
1974 }
1975 }
1976 // Create an anonymous tile with the tensor component
1977 else
1978 {
1979 assert(_arguments.has_tensor_argument(src.value()));
1980 auto x = _arguments[src.value()];
1981 const std::string val = x->component(to_tensor_component(src.type()));
1982 const DataType dt = x->component_data_type();
1983 return _tiles.insert({{{val}}}, dt);
1984 }
1985 }
1986
1987private:
1988 DataType to_tile_data_type(OperandType x)
1989 {
1990 return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
1991 }
1992
1993 TensorComponent to_tensor_component(OperandType x)
1994 {
1995 switch(x)
1996 {
1997 case OperandType::TensorDim0:
1998 return TensorComponent::Dim0;
1999 case OperandType::TensorDim1:
2000 return TensorComponent::Dim1;
2001 case OperandType::TensorDim2:
2002 return TensorComponent::Dim2;
2003 case OperandType::TensorDim3:
2004 return TensorComponent::Dim3;
2005 case OperandType::TensorDim4:
2006 return TensorComponent::Dim4;
2007 case OperandType::TensorStride1:
2008 return TensorComponent::Stride1;
2009 case OperandType::TensorStride2:
2010 return TensorComponent::Stride2;
2011 case OperandType::TensorStride3:
2012 return TensorComponent::Stride3;
2013 case OperandType::TensorStride4:
2014 return TensorComponent::Stride4;
2015 case OperandType::TensorDim1xDim2:
2016 return TensorComponent::Dim1xDim2;
2017 case OperandType::TensorDim1xDim2xDim3:
2018 return TensorComponent::Dim1xDim2xDim3;
2019 case OperandType::TensorDataOffset:
2020 return TensorComponent::OffsetFirstElement;
2021 default:
2022 assert(false);
2023 }
2024 }
2025
2026 GpuTileRegistry& _tiles;
2027 GpuTensorArgumentRegistry& _arguments;
2028};
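// Example (illustrative sketch; the tile name "acc" and tensor name "src" are hypothetical and
// must already exist in the respective registries):
//
//   OperandUnpacker operands(data.tiles, data.arguments);
//   IVectorTile *acc = operands.unpack(Operand("acc"));                          // existing tile
//   IVectorTile *c4  = operands.unpack(Operand("4", OperandType::ScalarInt32));  // anonymous constant tile
//   IVectorTile *dim = operands.unpack(Operand("src", OperandType::TensorDim1)); // anonymous tile from a tensor component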
2029
2030/**
2031 * @brief Utility class used to get the tensor argument from a tensor operand. The operand must refer to a tensor
2032 * that has already been registered in the tensor argument registry.
2033 * Tensor dimension reduction aims to reduce the tensor data dimensions while keeping the data's tensor structure.
2034 */
2035class TensorOperandUnpacker
2036{
2037public:
2038 TensorOperandUnpacker(GpuTensorArgumentRegistry& arguments) : _arguments(arguments)
2039 {
2040 };
2041
2042 IGpuTensorArgument* unpack(const TensorOperand& src)
2043 {
2044 assert(_arguments.has_tensor_argument(src.value()));
2045 return _arguments[src.value()];
2046 }
2047
2048private:
2049 GpuTensorArgumentRegistry& _arguments;
2050};
2051
2052/**
2053 * @brief The GpuKernel will be used on three occasions (stages):
2054 * #- Compilation stage
2055 * #- Tuning stage
2056 * #- Dispatch stage
2057 */
2058struct GpuKernel
2059{
2060 // Compilation stage
2061 std::string code {}; // Source code, required for the compilation stage
2062 std::vector<GpuExtensions> list_extensions{}; // Extensions, required for the compilation stage
2063 // Tuning stage
2064 std::string config_id {}; // Unique id, required for the tuning stage
2065 std::vector<LWS> list_lws{}; // LWS to test, required for the tuning stage
2066 // Dispatch stage
2067 GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
2068 std::vector<std::pair<int32_t, GpuTensorStorage>> list_tensor_storages; // List of tensor storages, required for the dispatch stage
2069 std::vector<std::pair<int32_t, TensorComponent>> list_tensor_components; // List of tensor components (width, stride, ...), required for the dispatch stage
2070};
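// Example (illustrative sketch): a minimal way to populate the three stages. The kernel name,
// the config id and the data holder "data" are hypothetical.
//
//   GpuKernel kernel;
//   kernel.code           = generate_code(data, "my_kernel"); // compilation stage
//   kernel.config_id      = "my_kernel_fp32";                 // tuning stage
//   kernel.list_lws.push_back({ 4, 4, 1 });                   // tuning stage
//   kernel.output_sampler = data.output_sampler;              // dispatch stage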
2071
2072// This function produces the complete kernel source code from the information in the data holder
2073inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string& name)
2074{
2075 std::string code;
2076 code += "__kernel void ";
2077 code += name;
2078 code += "(\n";
2079
2080 auto IdSpaces = in.arguments.IdSpace_declarations();
2081
2082 std::vector<std::string> arg_str;
2083
2084 auto tensor_args = in.arguments.tensor_argument_declarations();
2085
2086 for(auto &i : tensor_args)
2087 {
2088 // For each tensor used, get the storage and tensor components
2089 auto storages = i->storage_declarations();
2090 auto components = i->component_declarations();
2091
2092 for(auto &y : storages)
2093 {
2094 std::string str;
2095 str += i->storage_type_declaration(y);
2096 str += " ";
2097 str += i->storage(y);
2098 arg_str.push_back(str);
2099 }
2100
2101 for(auto &y : components)
2102 {
2103 std::string str;
2104 str += i->component_type_declaration();
2105 str += " ";
2106 str += i->component(y);
2107 arg_str.push_back(str);
2108 }
2109 }
2110
2111 for(size_t i = 0; i < arg_str.size(); ++i)
2112 {
2113 code += arg_str[i];
2114 if(i + 1 < arg_str.size())
2115 {
2116 code += ",\n";
2117 }
2118 }
2119
2120 code += ")\n";
2121 code += "{\n";
2122 code += in.code;
2123 code += "}\n";
2124
2125 return code;
2126};
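// Example (illustrative sketch): for a kernel named "binary_add" whose argument registry holds one
// buffer storage and one stride component, the emitted skeleton has the shape below. The actual
// argument names and type strings are produced by the tensor argument implementation, so the ones
// shown here are only indicative.
//
//   __kernel void binary_add(
//   __global uchar *src_ptr,
//   uint src_stride_y)
//   {
//   /* body accumulated in GpuKernelWriterDataHolder::code */
//   }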
2127
2128/**
2129 * @brief This class is responsible for mapping an N-dimensional tensor to a 3D tensor. The mapper needs the GpuSampler to know
2130 * how to reduce the dimensionality of the tensor.
2131 *
2132 */
2133class GpuTensor3dMapper
2134{
2135public:
2136 GpuTensor3dMapper(IGpuTensorArgument* tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor)
2137 {
2138 };
2139
2140 std::string tensor_component_x() const
2141 {
2142 const auto format = _sampler.format;
2143 switch(format)
2144 {
2145 case TensorSamplerFormat::C_WH_1:
2146 case TensorSamplerFormat::C_W_H:
2147 return _tensor->component(TensorComponent::C);
2148 default:
2149 std::cout << "Unsupported tensor format" << std::endl;
2150 assert(false);
2151 }
2152 }
2153
2154 std::string tensor_component_y() const
2155 {
2156 const auto format = _sampler.format;
2157 switch(format)
2158 {
2159 case TensorSamplerFormat::C_WH_1:
2160 return _tensor->component(TensorComponent::WxH);
2161 case TensorSamplerFormat::C_W_H:
2162 return _tensor->component(TensorComponent::W);
2163 default:
2164 std::cout << "Unsupported tensor format" << std::endl;
2165 assert(false);
2166 }
2167 }
2168
2169 std::string tensor_component_z() const
2170 {
2171 const auto format = _sampler.format;
2172 switch(format)
2173 {
2174 case TensorSamplerFormat::C_WH_1:
2175 return "1";
2176 case TensorSamplerFormat::C_W_H:
2177 return _tensor->component(TensorComponent::H);
2178 default:
2179 std::cout << "Unsupported tensor format" << std::endl;
2180 assert(false);
2181 }
2182 }
2183
2184 std::string tensor_component_stride_y() const
2185 {
2186 const auto format = _sampler.format;
2187 switch(format)
2188 {
2189 case TensorSamplerFormat::C_WH_1:
2190 case TensorSamplerFormat::C_W_H:
2191 return _tensor->component(TensorComponent::Stride1);
2192 default:
2193 std::cout << "Unsupported tensor format" << std::endl;
2194 assert(false);
2195 }
2196 }
2197
2198 std::string tensor_component_stride_z() const
2199 {
2200 const auto format = _sampler.format;
2201 switch(format)
2202 {
2203 case TensorSamplerFormat::C_WH_1:
2204 return "0";
2205 case TensorSamplerFormat::C_W_H:
2206 return _tensor->component(TensorComponent::Stride2);
2207 default:
2208 std::cout << "Unsupported tensor format" << std::endl;
2209 assert(false);
2210 }
2211 }
2212
2213 std::string tensor_component_stride_batch() const
2214 {
2215 const auto format = _sampler.format;
2216 switch(format)
2217 {
2218 case TensorSamplerFormat::C_WH_1:
2219 case TensorSamplerFormat::C_W_H:
2220 return _tensor->component(TensorComponent::Stride3);
2221 default:
2222 std::cout << "Unsupported tensor format" << std::endl;
2223 assert(false);
2224 }
2225 }
2226
2227 bool is_one_component_x() const
2228 {
2229 auto t = _tensor->format();
2230 const auto format = _sampler.format;
2231 switch(format)
2232 {
2233 case TensorSamplerFormat::C_WH_1:
2234 case TensorSamplerFormat::C_W_H:
2235 return t.shape[0] == 1;
2236 default:
2237 std::cout << "Unsupported tensor format" << std::endl;
2238 assert(false);
2239 }
2240 }
2241
2242 bool is_one_component_y() const
2243 {
2244 auto t = _tensor->format();
2245 const auto format = _sampler.format;
2246 switch(format)
2247 {
2248 case TensorSamplerFormat::C_WH_1:
2249 return (t.shape[1] * t.shape[2]) == 1;
2250 case TensorSamplerFormat::C_W_H:
2251 return t.shape[1] == 1;
2252 default:
2253 std::cout << "Unsupported tensor format" << std::endl;
2254 assert(false);
2255 }
2256 }
2257
2258 bool is_one_component_z() const
2259 {
2260 auto t = _tensor->format();
2261 const auto format = _sampler.format;
2262 switch(format)
2263 {
2264 case TensorSamplerFormat::C_WH_1:
2265 return true;
2266 case TensorSamplerFormat::C_W_H:
2267 return t.shape[2] == 1;
2268 default:
2269 std::cout << "Unsupported tensor format" << std::endl;
2270 assert(false);
2271 }
2272 }
2273
2274 bool is_one_component_batch() const
2275 {
2276 auto t = _tensor->format();
2277 const auto format = _sampler.format;
2278 switch(format)
2279 {
2280 case TensorSamplerFormat::C_WH_1:
2281 case TensorSamplerFormat::C_W_H:
2282 return t.shape[3] == 1;
2283 default:
2284 std::cout << "Unsupported tensor format" << std::endl;
2285 assert(false);
2286 }
2287 }
2288
2289 GpuSampler gpu_sampler() const
2290 {
2291 return _sampler;
2292 }
2293
2294 IGpuTensorArgument* tensor_argument() const
2295 {
2296 return _tensor;
2297 }
2298
2299private:
2300 GpuSampler _sampler;
2301 IGpuTensorArgument* _tensor;
2302};
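// Example (illustrative sketch): for a tensor of shape [C, W, H, N],
//  - TensorSamplerFormat::C_W_H  maps x -> C, y -> W and z -> H;
//  - TensorSamplerFormat::C_WH_1 maps x -> C, y -> W * H and z -> 1 (its Z stride is "0"),
//    so is_one_component_z() always returns true for this format.
// In both formats the batch stride is the tensor's Stride3 component.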
2303
2304struct GpuKernelWriterAttribute
2305{
2306 bool return_tensor_component_by_value { false };
2307};
2308
2309enum class ConvertPolicy
2310{
2311 Wrap, /**< Wrap around */
2312 Saturate /**< Saturate */
2313};
2314
2315enum class RoundingMode
2316{
2317 None,
2318 Rte,
2319 Rtz,
2320 Rtp,
2321 Rtn
2322};
2323
2324// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html
2325class IGpuKernelWriter
2326{
2327public:
2328 virtual ~IGpuKernelWriter() = default;
2329 virtual void set_IdSpace(int32_t id) = 0;
2330 virtual void import_tile(const std::string& dst, const IVectorTile *src) = 0;
2331 virtual void declare_argument(const std::string& name, const TensorInfo& tensor) = 0;
2332 virtual void declare_tile(const std::string& name, const TileInfo& info) = 0;
2333 virtual void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) = 0;
2334 virtual void write_text(const std::string& x) = 0;
2335 virtual void compound_statement_begin() = 0;
2336 virtual void compound_statement_end() = 0;
2337
2338 // Operations
2339 virtual void op_get_global_id(const Operand& dst_var, int32_t dim) = 0;
2340 virtual void op_get_global_coord(const Operand& dst, const Operand& step, const TensorOperand& tensor, int32_t dim) = 0;
2341 virtual void op_get_global_batch(const Operand& dst, const TensorOperand& tensor) = 0;
2342 virtual void op_get_global_size(const Operand& dst_var, int32_t dim) = 0;
2343 virtual void op_binary_expression(const Operand& dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
2344 virtual void op_assign(const Operand& dst_name, const Operand& src_name) = 0;
2345 virtual void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) = 0;
2346 virtual void op_if(const Operand& lhs, BinaryOp op, const Operand& rhs) = 0;
2347 virtual void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value, AssignmentOp update_op, const Operand& update_value) = 0;
2348 virtual void op_load_indirect(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y_indirect, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
2349 virtual void op_load_immediate(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32), const Operand& dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
2350 virtual void op_store_immediate(const TensorOperand& tensor, const Operand& src, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0;
2351 virtual void op_cast_expression(const Operand& dst, const Operand &src, ConvertPolicy policy) = 0;
2352 virtual void op_return() = 0;
2353 // virtual void op_else() = 0;
2354 // virtual void op_elseif() = 0;
2355 // Utils
2356 // Computes a tile of indirect addressing offsets (x + y * width, set to -1 when out of bounds)
2357 virtual void util_get_indirect_buffer(const Operand& dst, const TensorOperand& tensor, const Operand& x, const Operand& y, const Operand& x_off, const Operand& y_off) = 0;
2358};
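// Example (illustrative sketch; tile names are hypothetical and "dst_tensor" is a TensorOperand
// declared elsewhere): a typical sequence of writer calls.
//
//   writer->declare_tile("gid_x", TileInfo(DataType::Int32));
//   writer->op_get_global_id(Operand("gid_x"), 0);
//   writer->op_assign(Operand("acc"), Operand("src"));
//   writer->op_store_immediate(dst_tensor, Operand("acc"), Operand("gid_x"), Operand("y"), Operand("z"));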
2359
2360enum class GpuLoadStoreType
2361{
2362 Load = 1,
2363 Store = 2
2364};
2365
2366class IGpuLoadStoreHelperWriter
2367{
2368public:
2369 IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type) : _writer(x), _mapper(mapper), _type(type) {}
2370 IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;
2371 IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;
2372 virtual ~IGpuLoadStoreHelperWriter() = default;
2373 virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;
2374 virtual void write(const std::pair<int32_t, std::string>& y) = 0;
2375 virtual void finalize() = 0;
2376protected:
2377 IGpuKernelWriter* _writer;
2378 GpuTensor3dMapper _mapper;
2379 GpuLoadStoreType _type;
2380};
2381
2382class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter
2383{
2384public:
2385 ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
2386 {
2387 }
2388
2389 ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default;
2390 ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;
2391
2392 static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
2393 {
2394 CKW_UNUSED(x, type, dst);
2395
2396 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
2397 {
2398 return false;
2399 }
2400 return true;
2401 }
2402
2403 void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
2404 {
2405 assert(validate(_writer, _mapper, _type, dst));
2406
2407 _dst = dst;
2408 _ls_width_full = dst->format().w;
2409
2410 _coord_x = x->scalar(0, 0).str;
2411 _coord_z = z->scalar(0, 0).str;
2412 _coord_b = b->scalar(0, 0).str;
2413 _coord_orig_z = _coord_z;
2414
2415 out_of_bound_initialize_x(_coord_x);
2416 out_of_bound_initialize_z(_coord_z);
2417
2418 /*
2419 meaning of else:
2420 - x: partial load/store
2421 - y: no load/store operation
2422 - z: no load/store operation
2423 if(x)
2424 {
2425 if(z)
2426 {
2427 if(y)
2428 {
2429 // full load/store width
2430 }
2431 else
2432 {
2433 // no load/store
2434 }
2435 }
2436 else
2437 {
2438 // no load/store
2439 }
2440 }
2441 else
2442 {
2443 if(z)
2444 {
2445 if(y)
2446 {
2447 // partial load/store width
2448 }
2449 else
2450 {
2451 // no load/store
2452 }
2453 }
2454 else
2455 {
2456 // no load/store
2457 }
2458 }
2459 */
2460 }
2461
2462 void write(const std::pair<int32_t, std::string>& y) override
2463 {
2464 int32_t idx_y = y.first;
2465 std::string coord_y = y.second;
2466
2467 // The only check required is on Y.
2468 out_of_bound_initialize_y(coord_y);
2469
2470 const std::string dst = _dst->vector(idx_y).str;
2471 const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
2472 const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address);
2473
2474 _writer->write_text(ls_buf);
2475 _writer->write_text(";\n");
2476
2477 out_of_bound_finalize_y(dst);
2478
2479 // The left over load/store will be written in the finalize stage
2480 if(_ls_width_part.size() != 0)
2481 {
2482 int32_t w = 0;
2483 for(auto &p : _ls_width_part)
2484 {
2485 const std::string dst0 = _dst->vector(w, p, idx_y).str;
2486 const std::string coord_x = _coord_x + " + " + std::to_string(w);
2487 const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
2488 const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address);
2489 _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0));
2490
2491 w += p;
2492 }
2493 }
2494 }
2495
2496 void finalize() override
2497 {
2498 out_of_bound_finalize_z();
2499 out_of_bound_finalize_x();
2500 }
2501private:
2502 IVectorTile* _dst { nullptr };
2503 int32_t _ls_width_full { 0 };
2504 std::vector<int32_t> _ls_width_part { };
2505 std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x {};
2506 std::string _coord_x {};
2507 std::string _coord_z {};
2508 std::string _coord_orig_z {};
2509 std::string _coord_b {};
2510
2511 void out_of_bound_initialize_x(std::string& coord)
2512 {
2513 if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
2514 {
2515 auto tensor_format = _mapper.tensor_argument()->format();
2516 auto shape = tensor_format.shape;
2517
2518 _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
2519 if(_ls_width_part.size() != 0)
2520 {
2521 _writer->write_text("if(" + coord + " > 0)\n");
2522 _writer->compound_statement_begin();
2523 }
2524 }
2525 };
2526
2527 void out_of_bound_finalize_x()
2528 {
2529 if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
2530 {
2531 if(_ls_width_part.size() != 0)
2532 {
2533 _writer->compound_statement_end();
2534 _writer->write_text("else\n");
2535 _writer->compound_statement_begin();
2536
2537 out_of_bound_initialize_z(_coord_orig_z);
2538 for(auto &i : _leftovers_x)
2539 {
2540 out_of_bound_initialize_y(i.first.second);
2541 _writer->write_text(i.second);
2542 _writer->write_text(";\n");
2543 out_of_bound_finalize_y(i.first.first);
2544 }
2545 out_of_bound_finalize_z();
2546 _writer->compound_statement_end();
2547 }
2548 }
2549 };
2550
2551 void out_of_bound_initialize_y(std::string& coord)
2552 {
2553 std::string max = "";
2554
2555 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2556
2557 switch(address_mode_y)
2558 {
2559 case TensorSamplerAddressModeY::Skip:
2560 case TensorSamplerAddressModeY::ClampToBorder:
2561 // NOTE: This line should not be moved outside of the switch statement.
2562 // Querying the component marks it as used and adds it to the list of kernel arguments.
2563 // Since the component is not required in all cases, we should request it only when it is
2564 // actually used.
2565 max = _mapper.tensor_component_y();
2566 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2567 _writer->compound_statement_begin();
2568 break;
2569 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2570 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2571 _writer->write_text("if(" + coord + " >= 0)\n");
2572 _writer->compound_statement_begin();
2573 break;
2574 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2575 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2576 max = _mapper.tensor_component_y();
2577 _writer->write_text("if(" + coord + " < " + max + ")\n");
2578 _writer->compound_statement_begin();
2579 break;
2580 case TensorSamplerAddressModeY::ClampToNearest:
2581 max = _mapper.tensor_component_y();
2582 coord = "clamp(" + coord + ", 0, " + max + " - 1)";
2583 break;
2584 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
2585 max = _mapper.tensor_component_y();
2586 coord = "min(" + coord + ", " + max + " - 1)";
2587 break;
2588 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
2589 coord = "max(" + coord + ", 0)";
2590 break;
2591 case TensorSamplerAddressModeY::None:
2592 break;
2593 default:
2594 std::cout << "Unsupported address mode for out_of_bound_initialize_y" << std::endl;
2595 assert(false);
2596 }
2597 };
2598
2599 void out_of_bound_finalize_y(const std::string& dst)
2600 {
2601 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2602
2603 switch(address_mode_y)
2604 {
2605 case TensorSamplerAddressModeY::ClampToBorder:
2606 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2607 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2608 case TensorSamplerAddressModeY::Skip:
2609 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2610 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2611 _writer->compound_statement_end();
2612 break;
2613
2614 default: // The remaining address modes do not open a compound statement in out_of_bound_initialize_y()
2615 break;
2616 }
2617
2618 switch(address_mode_y)
2619 {
2620 case TensorSamplerAddressModeY::ClampToBorder:
2621 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2622 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2623 _writer->write_text("else\n");
2624 _writer->compound_statement_begin();
2625 _writer->write_text(dst);
2626 _writer->write_text(" = 0.0f;\n");
2627 _writer->compound_statement_end();
2628 break;
2629
2630 default: // Only the ClampToBorder* modes need an else branch that zero-fills the destination
2631 break;
2632 }
2633 };
2634
2635 void out_of_bound_initialize_z(std::string& coord)
2636 {
2637 std::string max = "";
2638
2639 const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
2640
2641 switch(address_mode_z)
2642 {
2643 case TensorSamplerAddressModeZ::Skip:
2644 max = _mapper.tensor_component_z();
2645 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2646 _writer->compound_statement_begin();
2647 break;
2648 case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
2649 _writer->write_text("if(" + coord + " >= 0)\n");
2650 _writer->compound_statement_begin();
2651 break;
2652 case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
2653 max = _mapper.tensor_component_z();
2654 _writer->write_text("if(" + coord + " < " + max + ")\n");
2655 _writer->compound_statement_begin();
2656 break;
2657 case TensorSamplerAddressModeZ::ClampToNearest:
2658 max = _mapper.tensor_component_z();
2659 coord = "clamp(" + coord + ", 0, " + max + " - 1)";
2660 break;
2661 case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
2662 max = _mapper.tensor_component_z();
2663 coord = "min(" + coord + ", " + max + " - 1)";
2664 break;
2665 case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
2666 coord = "max(" + coord + ", 0)";
2667 break;
2668 case TensorSamplerAddressModeZ::None:
2669 break;
2670 default:
2671 std::cout << "Unsupported address mode for out_of_bound_initialize_z" << std::endl;
2672 assert(false);
2673 }
2674 };
2675
2676 void out_of_bound_finalize_z()
2677 {
2678 const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
2679
2680 switch(address_mode_z)
2681 {
2682 case TensorSamplerAddressModeZ::Skip:
2683 case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
2684 case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
2685 _writer->compound_statement_end();
2686 break;
2687
2688 default: // The remaining address modes do not open a compound statement in out_of_bound_initialize_z()
2689 break;
2690 }
2691 };
2692
2693 std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const
2694 {
2695 std::vector<int32_t> x;
2696
2697 switch(ls_leftover_vector_width)
2698 {
2699 case 0:
2700 break;
2701 case 1:
2702 case 2:
2703 case 3:
2704 case 4:
2705 case 8:
2706 case 16:
2707 x.push_back(ls_leftover_vector_width);
2708 break;
2709 case 5:
2710 x.push_back(4);
2711 x.push_back(1);
2712 break;
2713 case 6:
2714 x.push_back(4);
2715 x.push_back(2);
2716 break;
2717 case 7:
2718 x.push_back(4);
2719 x.push_back(3);
2720 break;
2721 case 9:
2722 x.push_back(8);
2723 x.push_back(1);
2724 break;
2725 case 10:
2726 x.push_back(8);
2727 x.push_back(2);
2728 break;
2729 case 11:
2730 x.push_back(8);
2731 x.push_back(3);
2732 break;
2733 case 12:
2734 x.push_back(8);
2735 x.push_back(4);
2736 break;
2737 case 13:
2738 x.push_back(8);
2739 x.push_back(4);
2740 x.push_back(1);
2741 break;
2742 case 14:
2743 x.push_back(8);
2744 x.push_back(4);
2745 x.push_back(2);
2746 break;
2747 case 15:
2748 x.push_back(8);
2749 x.push_back(4);
2750 x.push_back(3);
2751 break;
2752
2753 default:
2754 assert(false);
2755 }
2756 return x;
2757 }
2758
2759 std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& address)
2760 {
2761 switch(type)
2762 {
2763 case GpuLoadStoreType::Load:
2764 if(vector_width != 1)
2765 {
2766 return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
2767 }
2768 else
2769 {
2770 return data + " = *(" + address + ")";
2771 }
2772 break;
2773 case GpuLoadStoreType::Store:
2774 if(vector_width != 1)
2775 {
2776 return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
2777 }
2778 else
2779 {
2780 return "*(" + address + ") = " + data;
2781 }
2782 break;
2783 default:
2784 std::cout << "Unsupported GpuLoadStoreType" << std::endl;
2785 assert(false);
2786 }
2787 }
2788
2789 std::string to_ls_buffer_address(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
2790 {
2791 auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
2792 assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
2793 const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage);
2794 const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
2795
2796 std::string address;
2797 address += "(__global ";
2798 address += dst_type;
2799 address += "*)(";
2800 address += ptr_buf;
2801 if(x != "0" && (_mapper.is_one_component_x() != true))
2802 {
2803 address += " + (";
2804 address += x + ") * sizeof(" + dst_type + ")";
2805 }
2806 if(y != "0" && (_mapper.is_one_component_y() != true))
2807 {
2808 const std::string stride_y = _mapper.tensor_component_stride_y();
2809 address += " + (";
2810 address += y + ")";
2811 address += " * ";
2812 address += stride_y;
2813 }
2814 if(z != "0" && (_mapper.is_one_component_z() != true))
2815 {
2816 const std::string stride_z = _mapper.tensor_component_stride_z();
2817 address += " + (";
2818 address += z + ")";
2819 address += " * ";
2820 address += stride_z;
2821 }
2822 if(b != "0" && (_mapper.is_one_component_batch() != true))
2823 {
2824 const std::string stride_b = _mapper.tensor_component_stride_batch();
2825 address += " + (";
2826 address += b + ")";
2827 address += " * ";
2828 address += stride_b;
2829 }
2830 address += ")";
2831 return address;
2832 }
2833};
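// Example (illustrative sketch): for an Fp32 tile of width 4 stored through a BufferUint8Ptr
// storage, write() emits a statement of the form
//
//   <dst row> = vload4(0, (__global float*)(<tensor ptr> + (x) * sizeof(float) + (y) * <stride_1> + (z) * <stride_2>));
//
// where <dst row>, <tensor ptr>, <stride_1> and <stride_2> stand for the generated tile variable,
// the storage and the stride components of the tensor argument.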
2834
2835class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
2836{
2837public:
2838 static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type, IVectorTile *dst)
2839 {
2840 CKW_UNUSED(x);
2841
2842 if(dst->format().w != 4)
2843 {
2844 return false;
2845 }
2846 if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
2847 {
2848 return false;
2849 }
2850 if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
2851 {
2852 return false;
2853 }
2854 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
2855 {
2856 return false;
2857 }
2858 if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store)
2859 {
2860 return false;
2861 }
2862 if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
2863 {
2864 return false;
2865 }
2866 return true;
2867 /*
2868 - x: Only TensorSamplerAddressModeX::None is supported and the vector length must be 4
2869 - z: Only TensorSamplerAddressModeZ::None is supported
2870 */
2871 }
2872 ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type)
2873 {
2874 }
2875
2876 ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default;
2877 ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default;
2878
2879 void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
2880 {
2881 assert(validate(_writer, _mapper, _type, dst));
2882
2883 _dst = dst;
2884 _ls_width_full = dst->format().w;
2885 _coord_x = x->scalar(0, 0).str;
2886 _coord_z = z->scalar(0, 0).str;
2887 _coord_b = b->scalar(0, 0).str;
2888
2889 /*
2890 if(y)
2891 {
2892 // full load/store width
2893 }
2894 else
2895 {
2896 // no load/store
2897 }
2898 */
2899 }
2900
2901 void write(const std::pair<int32_t, std::string>& y) override
2902 {
2903 int32_t idx_y = y.first;
2904 std::string coord_y = y.second;
2905
2906 // The only check required is on Y.
2907 out_of_bound_initialize_y(coord_y);
2908
2909 const std::string dst = _dst->vector(idx_y).str;
2910 const std::string sampler = to_ls_image2d_sampler();
2911 const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b);
2912 const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord);
2913
2914 _writer->write_text(ls_buf);
2915 _writer->write_text(";\n");
2916
2917 out_of_bound_finalize_y(dst);
2918 }
2919
2920 void finalize() override
2921 {
2922 }
2923private:
2924 IVectorTile* _dst { nullptr };
2925 int32_t _ls_width_full { 0 };
2926 std::string _coord_x {};
2927 std::string _coord_z {};
2928 std::string _coord_b {};
2929
2930 void out_of_bound_initialize_y(std::string& coord)
2931 {
2932 std::string max = "";
2933
2934 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2935
2936 switch(address_mode_y)
2937 {
2938 case TensorSamplerAddressModeY::Skip:
2939 max = _mapper.tensor_component_y();
2940 _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
2941 _writer->compound_statement_begin();
2942 break;
2943 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2944 _writer->write_text("if(" + coord + " >= 0)\n");
2945 _writer->compound_statement_begin();
2946 break;
2947 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2948 max = _mapper.tensor_component_y();
2949 _writer->write_text("if(" + coord + " < " + max + ")\n");
2950 _writer->compound_statement_begin();
2951 break;
2952 case TensorSamplerAddressModeY::ClampToBorder:
2953 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
2954 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
2955 case TensorSamplerAddressModeY::ClampToNearest:
2956 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
2957 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
2958 case TensorSamplerAddressModeY::None:
2959 break;
2960 default:
2961 std::cout << "Unsupported address mode for out_of_bound_initialize_y" << std::endl;
2962 assert(false);
2963 }
2964 };
2965
2966 void out_of_bound_finalize_y(const std::string& dst)
2967 {
2968 CKW_UNUSED(dst);
2969
2970 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
2971
2972 switch(address_mode_y)
2973 {
2974 case TensorSamplerAddressModeY::Skip:
2975 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
2976 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
2977 _writer->compound_statement_end();
2978 break;
2979
2980 default: // The remaining address modes do not open a compound statement in out_of_bound_initialize_y()
2981 break;
2982 }
2983 };
2984
2985 std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& sampler, const std::string& coord)
2986 {
2987 CKW_UNUSED(vector_width);
2988
2989 auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
2990 const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage);
2991 // const DataType dt = _dst->format().dt;
2992 const std::string post_fix = _dst->format().dt == DataType::Fp32? "f" : "h";
2993
2994 switch(type)
2995 {
2996 case GpuLoadStoreType::Load:
2997 return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
2998 break;
2999 case GpuLoadStoreType::Store:
3000 return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
3001 default:
3002 std::cout << "Unsupported GpuLoadStoreType" << std::endl;
3003 assert(false);
3005 }
3006 }
3007
3008 std::string to_ls_image2d_sampler() const
3009 {
3010 const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
3011
3012 switch(address_mode_y)
3013 {
3014 case TensorSamplerAddressModeY::None:
3015 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
3016 case TensorSamplerAddressModeY::Skip:
3017 case TensorSamplerAddressModeY::SkipMinEdgeOnly:
3018 case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
3019 case TensorSamplerAddressModeY::ClampToBorder:
3020 case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
3021 case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
3022 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
3023 case TensorSamplerAddressModeY::ClampToNearest:
3024 case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
3025 case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
3026 return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
3027 default:
3028 std::cout << "Unsupported address_mode_coord" << std::endl;
3029 assert(false);
3030 }
3031 }
3032
3033 std::string to_ls_image2d_coord(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const
3034 {
3035 std::string coord_x = "(" + x + ") >> 2";
3036 std::string coord_y = "(";
3037
3038 if(y != "0" && (_mapper.is_one_component_y() != true))
3039 {
3040 coord_y += y;
3041 }
3042 if(z != "0" && (_mapper.is_one_component_z() != true))
3043 {
3044 const std::string dim = _mapper.tensor_component_y();
3045 coord_y += " + (";
3046 coord_y += z + ")";
3047 coord_y += " * ";
3048 coord_y += dim;
3049 }
3050 if(b != "0" && (_mapper.is_one_component_batch() != true))
3051 {
3052 const std::string dim0 = _mapper.tensor_component_y();
3053 const std::string dim1 = _mapper.tensor_component_z();
3054 coord_y += " + (";
3055 coord_y += b + ")";
3056 coord_y += " * ";
3057 coord_y += dim0;
3058 coord_y += " * ";
3059 coord_y += dim1;
3060 }
3061 coord_y += ")";
3062 return "(int2)(" + coord_x + ", " + coord_y + ")";
3063 }
3064};
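// Example (illustrative sketch): for an Fp32 tile of width 4 read from an Image2dReadOnly storage
// with address_mode_y = None, write() emits a statement of the form
//
//   <dst row> = read_imagef(<image2d>, CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST,
//                           (int2)((x) >> 2, (y + (z) * <dim_y>)));
//
// i.e. X addresses float4 texels and Z (and the batch index) are folded into the Y coordinate.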
3065
3066/** IGpuLoadStoreHelperWriter factory class */
3067class ClLoadStoreHelperWriterFactory final
3068{
3069public:
3070 /** Static method to create the IGpuLoadStoreHelperWriter subclass that matches the tensor storage set in the mapper
3071 *
3072 * @return IGpuLoadStoreHelperWriter
3073 */
3075 static std::unique_ptr<IGpuLoadStoreHelperWriter> create(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type)
3076 {
3077 const auto tensor_storage = mapper.gpu_sampler().storage;
3078 switch(tensor_storage)
3079 {
3080 case GpuSamplerTensorStorage::BufferUint8Ptr:
3081 return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
3082 case GpuSamplerTensorStorage::Image2dReadOnly:
3083 case GpuSamplerTensorStorage::Image2dWriteOnly:
3084 return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
3085 default:
3086 std::cout << "Unsupported Gpu tensor storage" << std::endl;
3087 assert(false);
3088 }
3089 }
3090};
3091
3092// This utility method needs to go in utils.h
3093inline bool is_tile_scalar(IVectorTile* x)
3094{
3095 return x->format().w == 1 && x->format().h == 1;
3096}
3097
3098class ClKernelWriter : public IGpuKernelWriter
3099{
3100public:
3101 ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
3102 {
3103 _data = x;
3104 _attr = attr;
3105 }
3106
3107 ClKernelWriter(const ClKernelWriter &) = default;
3108 ClKernelWriter &operator=(const ClKernelWriter &) = default;
3109
3110 // An IdSpace ID identifies a fragment that is registered in ICode to ensure
3111 // there are no conflicts or ambiguities in the generated code
3112 void set_IdSpace(int32_t id) override
3113 {
3114 _data->tiles.set_IdSpace(id);
3115 _data->arguments.set_IdSpace(id);
3116 }
3117
3118 void import_tile(const std::string& dst_name, const IVectorTile *src) override
3119 {
3120 _data->tiles.insert(dst_name, src);
3121 }
3122
3123 void declare_argument(const std::string& name, const TensorInfo& tensor) override
3124 {
3125 assert(_data->arguments[name] == nullptr);
3126 _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
3127 }
3128
3129 void declare_tile(const std::string& name, const TileInfo& format) override
3130 {
3131 assert(_data->tiles[name] == nullptr);
3132 _data->tiles.insert(name, format);
3133
3134 IVectorTile *x = _data->tiles[name];
3135
3136 for(auto &t : x->underlying_source_variables())
3137 {
3138 _data->code += t.type.str + " " + t.str + ";\n";
3139 }
3140 }
3141
3142 void declare_const_tile(const std::string& name, const std::vector<std::vector<std::string>>& in, DataType dt) override
3143 {
3144 assert(_data->tiles[name] == nullptr);
3145 _data->tiles.insert(name, in, dt);
3146 // Note: A constant does not need to be declared in the code
3147 }
3148
3149 void write_text(const std::string& x) override
3150 {
3151 _data->code += x;
3152 }
3153
3154 void compound_statement_begin() override
3155 {
3156 _data->tiles.increment_registry_level();
3157 _data->code += "{\n";
3158 }
3159
3160 void compound_statement_end() override
3161 {
3162 _data->tiles.decrement_registry_level();
3163 _data->code += "}\n";
3164 }
3165
3166 void op_get_global_id(const Operand& dst_var, int32_t dim) override
3167 {
3168 assert(dst_var.type() == OperandType::Tile);
3169 assert(_data->tiles.has_tile(dst_var.value()));
3170 assert(_data->tiles[dst_var.value()]->format().w == 1 &&
3171 _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
3172
3173 auto var = _data->tiles[dst_var.value()];
3174
3175 _data->code += var->scalar(0, 0).str;
3176 _data->code += " = get_global_id(";
3177 _data->code += std::to_string(dim);
3178 _data->code += ");\n";
3179 };
3180
3181 void op_get_global_coord(const Operand& o_dst, const Operand& o_step, const TensorOperand& o_tensor, int32_t dim) override
3182 {
3183 OperandUnpacker operands(_data->tiles, _data->arguments);
3184 auto dst = operands.unpack(o_dst);
3185 auto step = operands.unpack(o_step);
3186
3187 // Validation: Check that x, y and z are scalar
3188
3189 TensorOperandUnpacker tensor_operands(_data->arguments);
3190 auto tensor = tensor_operands.unpack(o_tensor);
3191 auto gpu_sampler = o_tensor.sampler();
3192
3193 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3194
3195 switch (dim)
3196 {
3197 case 0:
3198 if(mapper.is_one_component_x())
3199 {
3200 _data->code += dst->scalar(0, 0).str;
3201 _data->code += " = 0;\n";
3202 }
3203 else
3204 {
3205 if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
3206 {
3207 // Validation: Check: fixed tensor shape
3208 // TO BE CHANGED
3209 _data->code += dst->scalar(0, 0).str;
3210 _data->code += " = get_global_id(0) * ";
3211 _data->code += step->scalar(0, 0).str;
3212 _data->code += ";\n";
3213 }
3214 else
3215 {
3216 _data->code += dst->scalar(0, 0).str;
3217 _data->code += " = get_global_id(0) * ";
3218 _data->code += step->scalar(0, 0).str;
3219 _data->code += ";\n";
3220 }
3221 }
3222 break;
3223 case 1:
3224 if(mapper.is_one_component_y())
3225 {
3226 _data->code += dst->scalar(0, 0).str;
3227 _data->code += " = 0;\n";
3228 }
3229 else
3230 {
3231 if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
3232 {
3233
3234 }
3235 else
3236 {
3237 _data->code += dst->scalar(0, 0).str;
3238 _data->code += " = get_global_id(1) * ";
3239 _data->code += step->scalar(0, 0).str;
3240 _data->code += ";\n";
3241 }
3242 }
3243 break;
3244 case 2:
3245 if(mapper.is_one_component_z())
3246 {
3247 _data->code += dst->scalar(0, 0).str;
3248 _data->code += " = 0;\n";
3249 }
3250 else
3251 {
3252 _data->code += dst->scalar(0, 0).str;
3253 _data->code += " = get_global_id(2) * ";
3254 _data->code += step->scalar(0, 0).str;
3255 _data->code += ";\n";
3256 }
3257 break;
3258 default:
3259 break;
3260 }
3261 };
3262
3263 void op_get_global_batch(const Operand& o_dst, const TensorOperand& o_tensor) override
3264 {
3265 OperandUnpacker operands(_data->tiles, _data->arguments);
3266 auto dst = operands.unpack(o_dst);
3267
3268 TensorOperandUnpacker tensor_operands(_data->arguments);
3269 auto tensor = tensor_operands.unpack(o_tensor);
3270 auto gpu_sampler = o_tensor.sampler();
3271
3272 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3273
3274 if(mapper.is_one_component_batch())
3275 {
3276 _data->code += dst->scalar(0, 0).str;
3277 _data->code += " = 0;\n";
3278 }
3279 else
3280 {
3281 std::cout << "Unsupported batched computation" << std::endl;
3282 assert(false);
3283 }
3284 };
3285
3286 void op_get_global_size(const Operand& dst_var, int32_t dim) override
3287 {
3288 assert(dst_var.type() == OperandType::Tile);
3289 assert(_data->tiles.has_tile(dst_var.value()));
3290 assert(_data->tiles[dst_var.value()]->format().w == 1 &&
3291 _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
3292
3293 auto var = _data->tiles[dst_var.value()];
3294
3295 _data->code += var->scalar(0, 0).str;
3296 _data->code += " = get_global_size(";
3297 _data->code += std::to_string(dim);
3298 _data->code += ");\n";
3299 }
3300
3301 void op_binary_expression(const Operand& dst_name, const Operand& lhs_name, BinaryOp op, const Operand& rhs_name) override
3302 {
3303 OperandUnpacker operands(_data->tiles, _data->arguments);
3304 auto lhs = operands.unpack(lhs_name);
3305 auto rhs = operands.unpack(rhs_name);
3306 auto dst = operands.unpack(dst_name);
3307
3308 const int32_t dst_w = dst->format().w;
3309 const int32_t dst_h = dst->format().h;
3310 assert(lhs != nullptr);
3311 const int32_t lhs_w = lhs->format().w;
3312 const int32_t rhs_w = rhs->format().w;
3313
3314 if(op == BinaryOp::MatMul_Nt_T)
3315 {
3316 assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
3317 for(int32_t y = 0; y < dst_h; ++y)
3318 {
3319 for(int32_t x = 0; x < dst_w; ++x)
3320 {
3321 for(int32_t k = 0; k < lhs_w; ++k)
3322 {
3323 _data->code += dst->scalar(x, y).str;
3324 _data->code += " = fma(";
3325 _data->code += lhs->scalar(k, y).str;
3326 _data->code += ", ";
3327 _data->code += rhs->scalar(k, x).str;
3328 _data->code += ", ";
3329 _data->code += dst->scalar(x, y).str;
3330 _data->code += ");\n";
3331 }
3332 }
3333 }
3334
3335 return;
3336 }
3337
3338 bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
3339 bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;
3340
3341 std::string lhs_prefix = broadcast_lhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
3342 std::string rhs_prefix = broadcast_rhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
3343 std::string op_str = to_string(op);
3344
3345 // Broadcasting on Y is automatic
3346 for(int32_t y = 0; y < dst_h; ++y)
3347 {
3348 _data->code += dst->vector(y).str;
3349 _data->code += " = ";
3350 _data->code += lhs_prefix + lhs->vector(y).str;
3351 _data->code += " ";
3352 _data->code += op_str;
3353 _data->code += " ";
3354 _data->code += rhs_prefix + rhs->vector(y).str;
3355 _data->code += ";\n";
3356 }
3357 };
3358
3359 void op_cast_expression(const Operand& o_dst, const Operand &o_src, ConvertPolicy policy) override
3360 {
3361 CKW_UNUSED(policy);
3362
3363 OperandUnpacker operands(_data->tiles, _data->arguments);
3364 auto src = operands.unpack(o_src);
3365 auto dst = operands.unpack(o_dst);
3366
3367 // const int32_t dst_w = dst->format().w;
3368 const int32_t dst_h = dst->format().h;
3369 const std::string dt = dst->scalar(0, 0).type.str;
3370
3371 // Broadcasting on Y is automatic
3372 for(int32_t y = 0; y < dst_h; ++y)
3373 {
3374 _data->code += dst->vector(y).str;
3375 _data->code += " = convert_" + dt + "(";
3376 _data->code += src->vector(y).str;
3377 _data->code += ");\n";
3378 }
3379 };
3380
3381 void op_assign(const Operand& dst_name, const Operand& src_name) override
3382 {
3383 OperandUnpacker operands(_data->tiles, _data->arguments);
3384 auto src = operands.unpack(src_name);
3385 auto dst = operands.unpack(dst_name);
3386
3387 const int32_t dst_w = dst->format().w;
3388 const int32_t dst_h = dst->format().h;
3389 const int32_t src_w = src->format().w;
3390 // const int32_t src_h = src->format().h;
3391 const std::string dt = dst->scalar(0, 0).type.str;
3392
3393 bool broadcast_src_x = dst_w != 1 && src_w == 1;
3394
3395 std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";
3396
3397 // Broadcasting on Y is automatic
3398 for(int32_t y = 0; y < dst_h; ++y)
3399 {
3400 _data->code += dst->vector(y).str;
3401 _data->code += " = ";
3402 _data->code += src_prefix + src->vector(y).str;
3403 _data->code += ";\n";
3404 }
3405 }
3406
3407 void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) override
3408 {
3409 OperandUnpacker operands(_data->tiles, _data->arguments);
3410 auto src = operands.unpack(src_name);
3411 auto dst = operands.unpack(dst_name);
3412
3413 const int32_t dst_w = dst->format().w;
3414 const int32_t dst_h = dst->format().h;
3415 const int32_t src_w = src->format().w;
3416 // const int32_t src_h = src->format().h;
3417 const std::string dt = dst->scalar(0, 0).type.str;
3418
3419 bool broadcast_src_x = dst_w != 1 && src_w == 1;
3420
3421 std::string src_prefix = broadcast_src_x? "(" + dt + ")" : "";
3422
3423 // Broadcasting on Y is automatic
3424 for(int32_t y = 0; y < dst_h; ++y)
3425 {
3426 _data->code += dst->vector(y).str;
3427 _data->code += " = ";
3428
3429 switch(func)
3430 {
3431 case ScalarUnaryFunction::Exp:
3432 _data->code += "exp(";
3433 break;
3434
3435 default:
3436 CKW_ASSERT(false);
3437 }
3438
3439 _data->code += src_prefix + src->vector(y).str;
3440 _data->code += ");\n";
3441 }
3442 }
3443
3444 void op_if(const Operand& o_lhs, BinaryOp op, const Operand& o_rhs) override
3445 {
3446 OperandUnpacker operands(_data->tiles, _data->arguments);
3447 auto lhs = operands.unpack(o_lhs);
3448 auto rhs = operands.unpack(o_rhs);
3449
3450 assert(is_tile_scalar(lhs));
3451 assert(is_tile_scalar(rhs));
3452
3453 _data->code += "if(";
3454 _data->code += lhs->scalar(0, 0).str;
3455 _data->code += " ";
3456 _data->code += to_string(op);
3457 _data->code += " ";
3458 _data->code += rhs->scalar(0, 0).str;
3459 _data->code += ")\n";
3460 }
3461
3462 void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, AssignmentOp update_op, const Operand& update_value_name) override
3463 {
3464 OperandUnpacker operands(_data->tiles, _data->arguments);
3465 auto var = operands.unpack(var_name);
3466 auto cond_value = operands.unpack(cond_value_name);
3467 auto update_value = operands.unpack(update_value_name);
3468
3469 const int32_t dst_w = var->format().w;
3470 const int32_t dst_h = var->format().h;
3471
3472 // It must be a scalar variable
3473 assert(dst_w == 1);
3474 assert(dst_h == 1);
3475
3476 _data->code += "for(; " ;
3477 _data->code += var->scalar(0, 0).str;
3478 _data->code += " ";
3479 _data->code += to_string(cond_op);
3480 _data->code += " " + cond_value->scalar(0, 0).str + "; ";
3481 _data->code += var->scalar(0, 0).str;
3482 _data->code += " ";
3483 _data->code += to_string(update_op);
3484 _data->code += " " + update_value->scalar(0, 0).str + ")";
3485 _data->code += "\n";
3486 }
3487
3488 void op_load_immediate(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_y, const Operand& o_z, const Operand& o_batch_idx, const Operand& dilation_y) override
3489 {
3490 OperandUnpacker operands(_data->tiles, _data->arguments);
3491 auto dst = operands.unpack(o_dst);
3492 auto x = operands.unpack(o_x);
3493 auto y = operands.unpack(o_y);
3494 auto z = operands.unpack(o_z);
3495 auto dil_y = operands.unpack(dilation_y);
3496 auto b = operands.unpack(o_batch_idx);
3497
3498 TensorOperandUnpacker tensor_operands(_data->arguments);
3499 auto tensor = tensor_operands.unpack(o_tensor);
3500 auto gpu_sampler = o_tensor.sampler();
3501
3502 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3503
3504 auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
3505
3506 // Initialize the constant part
3507 load_writer->initialize(dst, x, z, b);
3508
3509 for(int i = 0; i < dst->format().h; ++i)
3510 {
3511 std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
3512 if(dil_y->scalar(0, 0).str != "1")
3513 {
3514 coord_y += " * " + dil_y->scalar(0, 0).str;
3515 }
3516 load_writer->write(std::make_pair(i, coord_y));
3517 }
3518
3519 load_writer->finalize();
3520 }
3521
3522 void op_load_indirect(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_indirect_h, const Operand& o_z, const Operand& o_batch_idx) override
3523 {
3524 OperandUnpacker operands(_data->tiles, _data->arguments);
3525 auto dst = operands.unpack(o_dst);
3526 auto x = operands.unpack(o_x);
3527 auto y_ind = operands.unpack(o_indirect_h);
3528 auto z = operands.unpack(o_z);
3529 auto b = operands.unpack(o_batch_idx);
3530
3531 TensorOperandUnpacker tensor_operands(_data->arguments);
3532 auto tensor = tensor_operands.unpack(o_tensor);
3533 auto gpu_sampler = o_tensor.sampler();
3534
3535 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3536
3537 auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
3538
3539 // Initialize the constant part
3540 load_writer->initialize(dst, x, z, b);
3541
3542 for(int i = 0; i < dst->format().h; ++i)
3543 {
3544 load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
3545 }
3546
3547 load_writer->finalize();
3548 }
3549
3550 void op_store_immediate(const TensorOperand& tensor_name, const Operand& src_name, const Operand& x_name, const Operand& y_name, const Operand& z_name, const Operand& batch_index_name) override
3551 {
3552 OperandUnpacker operands(_data->tiles, _data->arguments);
3553 auto src = operands.unpack(src_name);
3554 auto x = operands.unpack(x_name);
3555 auto y = operands.unpack(y_name);
3556 auto z = operands.unpack(z_name);
3557 auto b = operands.unpack(batch_index_name);
3558
3559 TensorOperandUnpacker tensor_operands(_data->arguments);
3560 auto tensor = tensor_operands.unpack(tensor_name);
3561 auto gpu_sampler = tensor_name.sampler();
3562
3563 GpuTensor3dMapper mapper(tensor, gpu_sampler);
3564
3565 auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);
3566
3567 // Initialize the constant part
3568 store_writer->initialize(src, x, z, b);
3569
3570 int32_t tile_h = src->format().h;
3571
3572 for(int m0 = tile_h - 1; m0 >= 0; m0--)
3573 {
3574 store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
3575 }
3576
3577 store_writer->finalize();
3578 }
3579
3580 void op_return() override
3581 {
3582 _data->code += "return;\n";
3583 }
3584
3585 void util_get_indirect_buffer(const Operand& o_dst, const TensorOperand& o_tensor, const Operand& o_x, const Operand& o_y, const Operand& o_x_off, const Operand& o_y_off) override
3586 {
3587 OperandUnpacker operands(_data->tiles, _data->arguments);
3588 auto dst = operands.unpack(o_dst);
3589 auto x = operands.unpack(o_x);
3590 auto y = operands.unpack(o_y);
3591 auto x_off = operands.unpack(o_x_off);
3592 auto y_off = operands.unpack(o_y_off);
3593
3594 TensorOperandUnpacker tensor_operands(_data->arguments);
3595 auto tensor = tensor_operands.unpack(o_tensor);
3596
3597 assert(dst->format().w == 1);
3598 assert(x->format().w == 1);
3599 assert(y->format().w == 1);
3600 assert(x_off->format().w == 1);
3601 assert(y_off->format().w == 1);
3602 assert(dst->format().dt == DataType::Int32);
3603 assert(x->format().dt == DataType::Int32);
3604 assert(y->format().dt == DataType::Int32);
3605 assert(x_off->format().dt == DataType::Int32);
3606 assert(y_off->format().dt == DataType::Int32);
3607
3608 const std::string width = tensor->component(TensorComponent::W);
3609 const std::string height = tensor->component(TensorComponent::H);
3610 const std::string wxh = tensor->component(TensorComponent::WxH);
3611 /*
3612 int x_s;
3613 int y_s;
3614 x_s = (xi_0 + x_k);
3615 y_s = (yi_0 + y_k);
3616 mi_0 = x_s + y_s * width + b * widthxheight;
3617 mi_0 = select(-1, mi_0, x_s >= 0);
3618 mi_0 = select(-1, mi_0, y_s >= 0);
3619 mi_0 = select(-1, mi_0, x_s < 128);
3620 mi_0 = select(-1, mi_0, y_s < 128);
3621 */
3622 compound_statement_begin();
3623 declare_tile("_x_s", TileInfo(DataType::Int32));
3624 declare_tile("_y_s", TileInfo(DataType::Int32));
3625 auto x_s = operands.unpack(Operand("_x_s"));
3626 auto y_s = operands.unpack(Operand("_y_s"));
3627 for(int i = 0; i < dst->format().h; ++i)
3628 {
3629 // x_s = (xi_0 + x_k);
3630 // y_s = (yi_0 + y_k);
3631 _data->code += x_s->scalar(0, i).str;
3632 _data->code += " = (";
3633 _data->code += x->scalar(0, i).str;
3634 _data->code += " + ";
3635 _data->code += x_off->scalar(0, i).str;
3636 _data->code += ");\n";
3637 _data->code += y_s->scalar(0, i).str;
3638 _data->code += " = (";
3639 _data->code += y->scalar(0, i).str;
3640 _data->code += " + ";
3641 _data->code += y_off->scalar(0, i).str;
3642 _data->code += ");\n";
3643 // mi_0 = x_s + y_s * width;
3644 _data->code += dst->scalar(0, i).str;
3645 _data->code += " = ";
3646 _data->code += x_s->scalar(0, i).str;
3647 _data->code += " + ";
3648 _data->code += y_s->scalar(0, i).str;
3649 _data->code += " * " + width + ";\n";
3650 // mi_0 = select(wxh, mi_0, x_s >= 0);
3651 _data->code += dst->scalar(0, i).str;
3652 _data->code += " = select(-1, ";
3653 _data->code += dst->scalar(0, i).str;
3654 _data->code += ", ";
3655 _data->code += x_s->scalar(0, i).str;
3656 _data->code += " >= 0);\n";
3657 // mi_0 = select(wxh, mi_0, y_s >= 0);
3658 _data->code += dst->scalar(0, i).str;
3659 _data->code += " = select(-1, ";
3660 _data->code += dst->scalar(0, i).str;
3661 _data->code += ", ";
3662 _data->code += y_s->scalar(0, i).str;
3663 _data->code += " >= 0);\n";
3664 // mi_0 = select(wxh, mi_0, x_s < width);
3665 _data->code += dst->scalar(0, i).str;
3666 _data->code += " = select(-1, ";
3667 _data->code += dst->scalar(0, i).str;
3668 _data->code += ", ";
3669 _data->code += x_s->scalar(0, i).str;
3670 _data->code += " < ";
3671 _data->code += width + ");\n";
3672 // mi_0 = select(wxh, mi_0, y_s < height);
3673 _data->code += dst->scalar(0, i).str;
3674 _data->code += " = select(-1, ";
3675 _data->code += dst->scalar(0, i).str;
3676 _data->code += ", ";
3677 _data->code += y_s->scalar(0, i).str;
3678 _data->code += " < ";
3679 _data->code += height + ");\n";
3680 }
3681 compound_statement_end();
3682 }
3683
3684private:
3685 GpuKernelWriterDataHolder* _data { nullptr };
3686 GpuKernelWriterAttribute * _attr { nullptr };
3687};
3688
3689/** IGpuKernelWriter factory class */
3690class GpuKernelWriterFactory final
3691{
3692public:
3693 /** Static method to create the IGpuKernelWriter subclass that matches the GPU programming language
3694 *
3695 * @param[in] attr Writer attributes
3696 * @param[in] x Kernel data holder, which provides the target programming language
3697 *
3698 * @return IGpuKernelWriter
3699 */
3699 static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
3700 {
3701 switch(x->programming_language())
3702 {
3703 case GpuTargetLanguage::OpenCL:
3704 return std::make_unique<ClKernelWriter>(attr, x);
3705 default:
3706 std::cout << "Unsupported Gpu programming language" << std::endl;
3707 assert(false);
3708 }
3709 }
3710};
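// Example (illustrative sketch): minimal end-to-end flow. The tensor info "dst_info", the tile
// and kernel names are hypothetical.
//
//   GpuKernelWriterAttribute  attr { false };
//   GpuKernelWriterDataHolder data(GpuTargetLanguage::OpenCL);
//   auto writer = GpuKernelWriterFactory::create(&attr, &data);
//   writer->set_IdSpace(1);
//   writer->declare_argument("dst", dst_info);
//   writer->declare_tile("gid_x", TileInfo(DataType::Int32));
//   writer->op_get_global_id(Operand("gid_x"), 0);
//
//   GpuKernel kernel;
//   kernel.code = generate_code(data, "example_kernel");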
3711
3712inline int32_t adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx)
3713{
3714 auto tensor = tensor_info_id->shape;
3715
3716 int32_t dim[3] = {0};
3717
3718 switch(tensor_format)
3719 {
3720 case TensorSamplerFormat::C_W_H:
3721 dim[0] = tensor[0];
3722 dim[1] = tensor[1];
3723 dim[2] = tensor[2];
3724 break;
3725 case TensorSamplerFormat::C_WH_1:
3726 dim[0] = tensor[0];
3727 dim[1] = tensor[1] * tensor[2];
3728 dim[2] = 1;
3729 break;
3730 default:
3731 std::cout << "Unsupported tensor format" << std::endl;
3732 assert(false);
3733 break;
3734 }
3735
3736 return std::min(step, dim[idx]);
3737}
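// Example: with TensorSamplerFormat::C_WH_1, a tensor of shape [3, 7, 5, 1] and idx = 1,
// the addressable extent is 7 * 5 = 35, so adjust_step() returns min(step, 35): a requested
// step of 4 is kept, while a step of 64 is clamped to 35.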
3738
3739} // namespace prototype
3740} // namespace ckw
3741
3742#endif // CKW_SRC_PROTOTYPE_H