blob: e4ed47e7e67a72d20f6bdb61b01449a2a30171a3 [file] [log] [blame]
Usama Arif0681e3b2019-04-25 14:28:07 +01001/*
Gian Marco Iodiceff1fe3e2021-01-02 09:58:51 +00002 * Copyright (c) 2019-2021 Arm Limited.
Usama Arif0681e3b2019-04-25 14:28:07 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +010024#include "activation_float_helpers.h"
Usama Arif0681e3b2019-04-25 14:28:07 +010025#include "helpers.h"
26
/** Utility macro to access a sub-range of a vector through its scalar component names
 *
 * The macro resolves, via token pasting, to one of the scalar_access_<offset>_<n0> helpers
 * defined below. Only the (offset, n0) pairs that have a helper are supported:
 *  - offset = 0             : n0 = 1, 2, 3, 4, 8, 16
 *  - offset = 1, 2, 3, 4, 8 : n0 = 1, 2, 3, 4, 8
 *  - offset = 12            : n0 = 1, 2, 3, 4
 *  - offset = 15            : n0 = 1
 *  - offset = 16            : n0 = 1 (legacy spelling, see the note on scalar_access_16_1)
 *
 * @param[in] offset The position of the first component to access within the vector
 * @param[in] n0     The number of consecutive components to access. n0 + offset must be <= 16
 * @param[in] x      Vector to access
 * @{
 */
// Two-level expansion: SCALAR_ACCESS lets offset and n0 be macro-expanded before
// SCALAR_ACCESS_STR pastes them into the helper name.
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)

// offset == 0
#define scalar_access_0_1(x) ((x).s0)
#define scalar_access_0_2(x) ((x).s01)
#define scalar_access_0_3(x) ((x).s012)
#define scalar_access_0_4(x) ((x).s0123)
#define scalar_access_0_8(x) ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)

// offset == 1
#define scalar_access_1_1(x) ((x).s1)
#define scalar_access_1_2(x) ((x).s12)
#define scalar_access_1_3(x) ((x).s123)
#define scalar_access_1_4(x) ((x).s1234)
#define scalar_access_1_8(x) ((x).s12345678)

// offset == 2
#define scalar_access_2_1(x) ((x).s2)
#define scalar_access_2_2(x) ((x).s23)
#define scalar_access_2_3(x) ((x).s234)
#define scalar_access_2_4(x) ((x).s2345)
#define scalar_access_2_8(x) ((x).s23456789)

// offset == 3
#define scalar_access_3_1(x) ((x).s3)
#define scalar_access_3_2(x) ((x).s34)
#define scalar_access_3_3(x) ((x).s345)
#define scalar_access_3_4(x) ((x).s3456)
#define scalar_access_3_8(x) ((x).s3456789A)

// offset == 4
#define scalar_access_4_1(x) ((x).s4)
#define scalar_access_4_2(x) ((x).s45)
#define scalar_access_4_3(x) ((x).s456)
#define scalar_access_4_4(x) ((x).s4567)
#define scalar_access_4_8(x) ((x).s456789AB)

// offset == 8
#define scalar_access_8_1(x) ((x).s8)
#define scalar_access_8_2(x) ((x).s89)
#define scalar_access_8_3(x) ((x).s89A)
#define scalar_access_8_4(x) ((x).s89AB)
#define scalar_access_8_8(x) ((x).s89ABCDEF)

// offset == 12
#define scalar_access_12_1(x) ((x).sC)
#define scalar_access_12_2(x) ((x).sCD)
#define scalar_access_12_3(x) ((x).sCDE)
#define scalar_access_12_4(x) ((x).sCDEF)

// offset == 15
#define scalar_access_15_1(x) ((x).sF)

// offset == 16
// NOTE(review): this helper selects component F (index 15), which does not match a nominal
// offset of 16. It is kept unchanged so existing SCALAR_ACCESS(16, 1, x) users keep compiling;
// scalar_access_15_1 is the correctly named equivalent.
#define scalar_access_16_1(x) ((x).sF)
/** @} */ // end of group SCALAR_ACCESS
90
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) without allocating variables.
 * @name LOAD_TENSOR_ROW_n
 *
 * Each row k is assigned to the components [COL_OFFSET, COL_OFFSET + N0) of the already declared
 * variable BASENAMEk, reading from PTR + k * STRIDE_Y + Zk. Every LOAD_TENSOR_ROW_n expands
 * LOAD_TENSOR_ROW_(n-1) first and then appends the load of row n-1.
 *
 * @note PTR and STRIDE_Y are parenthesized in the expansion so that expression arguments
 *       (e.g. a sum passed as STRIDE_Y) keep their intended precedence.
 *
 * @param[in] N0         The number of columns to load
 * @param[in] DATA_TYPE  The data type of variables
 * @param[in] BASENAME   The basename of the destination variables for the loaded rows
 * @param[in] PTR        The base pointer
 * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
 * @param[in] STRIDE_Y   The stride value in y-axis direction
 * @param[in] Z          The z-axis offset vector
 * @{
 */
// Zero rows: expands to an empty statement expression, i.e. nothing is loaded.
#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    ({})

#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 0 * (STRIDE_Y) + Z##0));

#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 1 * (STRIDE_Y) + Z##1));

#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 2 * (STRIDE_Y) + Z##2));

#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 3 * (STRIDE_Y) + Z##3));

#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 4 * (STRIDE_Y) + Z##4));

#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 5 * (STRIDE_Y) + Z##5));

#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 6 * (STRIDE_Y) + Z##6));

#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 7 * (STRIDE_Y) + Z##7));

#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 8 * (STRIDE_Y) + Z##8));

#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 9 * (STRIDE_Y) + Z##9));

// Rows 10 to 15 use the hexadecimal suffixes A to F for the variable and Z offset names.
#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 10 * (STRIDE_Y) + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 11 * (STRIDE_Y) + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 12 * (STRIDE_Y) + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 13 * (STRIDE_Y) + Z##D));

#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 14 * (STRIDE_Y) + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + 15 * (STRIDE_Y) + Z##F));
/** @}*/ // end of group LOAD_TENSOR_ROW_n
168/** @}*/ // end of group LOAD_TENSOR_ROW_n
169
/** Load tensor (consecutive rows and columns) with Z offset.
 * @name LOAD_TENSOR
 *
 * Dispatches to LOAD_TENSOR_ROW_<M0>. Supported cases are M0=0,1,2,...,16 and N0=1,2,3,4,8,16
 * (M0=0 loads nothing).
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
 *
 * @param[in] M0         The number of consecutive rows
 * @param[in] N0         The number of consecutive columns
 * @param[in] DATA_TYPE  The data type of the target
 * @param[in] BASENAME   The basename of the result variables
 * @param[in] PTR        The base pointer for the data
 * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
 * @param[in] STRIDE_Y   The stride in y-axis direction
 * @param[in] Z          The z-axis offset vector
 * @{
 */
// LOAD_TENSOR forwards to LOAD_TENSOR_STR so that M0 is macro-expanded before it is pasted
// into the LOAD_TENSOR_ROW_<M0> name.
#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_TENSOR
192
/** Load 2D tensor (consecutive rows and columns) with Z offset.
 * @name LOAD_TENSOR_M0Xn
 *
 * For column counts that have a matching single load (n = 1, 2, 3, 4, 8, 16) one LOAD_TENSOR is
 * issued at column offset 0. For the other column counts the row is split into chunks of 8
 * and/or 4 columns plus a remainder of 1, 2 or 3 columns; each chunk is loaded from
 * input_ptr + <first column> * sizeof(DATA_TYPE) into the matching column offset of the
 * destination variables.
 *
 * @param[in] M0           The number of rows to load [0-16]
 * @param[in] N0           The number of columns to load [0-16]
 * @param[in] DATA_TYPE    The data type of variables
 * @param[in] a            The basename of the destination variables for the loaded rows
 * @param[in] input_ptr    The base pointer
 * @param[in] src_stride_y The stride value in y-axis direction
 * @param[in] zin          The z-axis offset vector
 * @{
 */
// Zero columns: expands to an empty statement expression, i.e. nothing is loaded.
#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({})

#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

// 5 = 4 + 1
#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, (input_ptr) + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

// 6 = 4 + 2
#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, (input_ptr) + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

// 7 = 4 + 3
#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, (input_ptr) + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

// 9 = 8 + 1 (the first load previously lacked the comma between input_ptr and the column offset)
#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, (input_ptr) + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

// 10 = 8 + 2
#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, (input_ptr) + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

// 11 = 8 + 3
#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, (input_ptr) + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

// 12 = 8 + 4
#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, (input_ptr) + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

// 13 = 8 + 4 + 1
#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, (input_ptr) + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, (input_ptr) + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

// 14 = 8 + 4 + 2 (the first load previously lacked the comma between input_ptr and the column offset)
#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, (input_ptr) + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, (input_ptr) + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

// 15 = 8 + 4 + 3
#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, (input_ptr) + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, (input_ptr) + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
/** @}*/ // end of group LOAD_TENSOR_M0Xn
269
/** Load 2D tensor (consecutive rows and columns) with Z offset.
 * @name LOAD_TENSOR_M0XN0
 *
 * Dispatches to LOAD_TENSOR_M0X<N0>, forwarding all the arguments unchanged.
 *
 * @param[in] M0        The number of consecutive rows [0-16]
 * @param[in] N0        The number of consecutive columns [0-16]
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @param[in] Z         The z-axis offset vector
 * @{
 */
// LOAD_TENSOR_M0XN0 forwards to LOAD_TENSOR_M0XN0_STR so that N0 is macro-expanded before it
// is pasted into the LOAD_TENSOR_M0X<N0> name.
#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group LOAD_TENSOR_M0XN0
284
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
 * @name LOAD_ROW_n
 *
 * Each row k declares a new variable BASENAMEk of type VEC_DATA_TYPE(DATA_TYPE, N0) and
 * initializes it with a VLOAD(N0) from PTR + OFFSET + k * STRIDE_Y + Zk. Every LOAD_ROW_n
 * expands LOAD_ROW_(n-1) first and then appends the declaration of row n-1.
 *
 * @note PTR, OFFSET and STRIDE_Y are parenthesized in the expansion so that expression
 *       arguments (e.g. a sum passed as STRIDE_Y) keep their intended precedence.
 *
 * @param[in] N0        The number of columns to load
 * @param[in] DATA_TYPE The data type of variables
 * @param[in] BASENAME  The basename of the destination variables for the loaded rows
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The z-axis offset vector
 * @{
 */
#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 0 * (STRIDE_Y) + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 1 * (STRIDE_Y) + Z##1));

#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 2 * (STRIDE_Y) + Z##2));

#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 3 * (STRIDE_Y) + Z##3));

#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 4 * (STRIDE_Y) + Z##4));

#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 5 * (STRIDE_Y) + Z##5));

#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 6 * (STRIDE_Y) + Z##6));

#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 7 * (STRIDE_Y) + Z##7));

#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 8 * (STRIDE_Y) + Z##8));

#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 9 * (STRIDE_Y) + Z##9));

// Rows 10 to 15 use the hexadecimal suffixes A to F for the variable and Z offset names.
#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 10 * (STRIDE_Y) + Z##A));

#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 11 * (STRIDE_Y) + Z##B));

#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 12 * (STRIDE_Y) + Z##C));

#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 13 * (STRIDE_Y) + Z##D));

#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 14 * (STRIDE_Y) + Z##E));

#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 15 * (STRIDE_Y) + Z##F));

/** @}*/ // end of group LOAD_ROW_n
Usama Arif0681e3b2019-04-25 14:28:07 +0100377
/** Load Blocks (consecutive rows and columns) with Z offset.
 * @name LOAD_BLOCK
 *
 * Dispatches to LOAD_ROW_<M0>. Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of consecutive rows
 * @param[in] N0        The number of consecutive columns
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @param[in] Z         The z-axis offset vector
 * @{
 */
// LOAD_BLOCK forwards to LOAD_BLOCK_STR so that M0 is macro-expanded before it is pasted
// into the LOAD_ROW_<M0> name.
#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK
400
/** Partially load the 0 to (n-1)th rows of the given variables
 * @name LOAD_ROW_PARTIAL_n
 * Within each row, load the lower @p LOAD_N0 elements of vectors of width @p N0
 *
 * Each row k invokes VLOAD_PARTIAL(N0, LOAD_N0) on the existing variable BASENAMEk, reading
 * from PTR + OFFSET + k * STRIDE_Y + Zk. Every LOAD_ROW_PARTIAL_n expands
 * LOAD_ROW_PARTIAL_(n-1) first and then appends the load of row n-1.
 *
 * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 * @note PTR, OFFSET and STRIDE_Y are parenthesized in the expansion so that expression
 *       arguments (e.g. a sum passed as STRIDE_Y) keep their intended precedence.
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] LOAD_N0   The **lower** size of the vectors to load. Supported: [1-16] and <= @p N0
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 0 * (STRIDE_Y) + Z##0));

#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 1 * (STRIDE_Y) + Z##1));

#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 2 * (STRIDE_Y) + Z##2));

#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 3 * (STRIDE_Y) + Z##3));

#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 4 * (STRIDE_Y) + Z##4));

#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 5 * (STRIDE_Y) + Z##5));

#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 6 * (STRIDE_Y) + Z##6));

#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 7 * (STRIDE_Y) + Z##7));

#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 8 * (STRIDE_Y) + Z##8));

#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 9 * (STRIDE_Y) + Z##9));

// Rows 10 to 15 use the hexadecimal suffixes A to F for the variable and Z offset names.
#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 10 * (STRIDE_Y) + Z##A));

#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 11 * (STRIDE_Y) + Z##B));

#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 12 * (STRIDE_Y) + Z##C));

#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 13 * (STRIDE_Y) + Z##D));

#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 14 * (STRIDE_Y) + Z##E));

#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)((PTR) + (OFFSET) + 15 * (STRIDE_Y) + Z##F));
/** @} */ // end of group LOAD_ROW_PARTIAL_n
496
/** Partially load a block of the given size LOAD_M0xLOAD_N0
 * @name LOAD_BLOCK_PARTIAL
 *
 * Dispatches to LOAD_ROW_PARTIAL_<LOAD_M0>. Note that the callee takes the vector width first:
 * the arguments are forwarded as (N0, LOAD_N0, ...).
 *
 * @note The vector width @p N0 is also required for correct partial loading behaviour.
 * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to load is expected to have consecutive names for each row.
 * E.g., for LOAD_M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for LOAD_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] LOAD_M0   The number of rows to load. Supported: 1-16
 * @param[in] LOAD_N0   The lower number of elements of vectors to load. Supported: 1-16 and <= @p N0
 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// LOAD_BLOCK_PARTIAL forwards to LOAD_BLOCK_PARTIAL_STR so that LOAD_M0 is macro-expanded
// before it is pasted into the LOAD_ROW_PARTIAL_<LOAD_M0> name.
#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK_PARTIAL
/** Load a block that can be partial in both x and y dimensions
 *
 * Selects one of four LOAD_BLOCK_PARTIAL invocations depending on the two conditions:
 *  - neither condition true : M0 rows, N0 columns
 *  - only PARTIAL_COND_Y    : PARTIAL_STORE_M0 rows, N0 columns
 *  - only PARTIAL_COND_X    : M0 rows, PARTIAL_STORE_N0 columns
 *  - both conditions true   : PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 * @note Each condition is evaluated exactly once per invocation (PARTIAL_COND_Y first, then PARTIAL_COND_X).
 *
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] OFFSET           The offset within a row
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
 */
#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(PARTIAL_COND_Y) \
    { \
        if(PARTIAL_COND_X) \
        { \
            LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
        } \
        else \
        { \
            LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
        } \
    } \
    else \
    { \
        if(PARTIAL_COND_X) \
        { \
            LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
        } \
        else \
        { \
            LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
        } \
    }
/** Load a block that can only be partial in x but not y.
 *
 * Always loads M0 rows; the number of columns is PARTIAL_STORE_N0 when PARTIAL_COND_X is true
 * and N0 otherwise.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] OFFSET           The offset within a row
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
 */
#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X) \
    { \
        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    } \
    else \
    { \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    }
/** Load a block that can be partial in y only (never in x).
 *
 * @note When @p N0 is not one of 1, 2, 3, 4, 8, 16, extra vload(s) are issued, with a small performance penalty.
 *
 * The destination variables must have consecutive names, one per row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offsets must have consecutive names as well.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * Every loaded row is a full @p N0 wide; only the number of rows depends on @p PARTIAL_COND_Y.
 *
 * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] OFFSET           The offset within a row
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
 */
#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y)                                                                                                   \
    {                                                                                                                    \
        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
    }                                                                                                                    \
    else                                                                                                                 \
    {                                                                                                                    \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
    }
/** @} */ // end of group LOAD_BLOCK_PARTIAL
/** Boundary-aware GeMM block load
 * @name LOAD_BLOCK_BOUNDARY_AWARE
 * This macro assumes the following schemes to achieve boundary-awareness:
 *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
 *  - Non-Overlapping(normal) load from rhs tensor. This implies rhs can have paddings.
 *  - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
 * The macro then ensures that the src tensor can be loaded without any paddings in both x and y dim.
 *
 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
 * blocks **at the end**.
 * Say, the src tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
 *
 *  *--x-->                         x == 0                        x == 1
 *  |                  |<------------------------------N-------------------------->|
 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
 *  |     -------------#############################################################
 *  *     |          | |...............................|...........................|
 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
 *        |          | |...............................|...........................|
 *        M          --#############################################################
 *        |          | |                               |...........................|
 * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
 *        |          | |                               |...........................|
 *        |------------#############################################################
 *
 * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 *
 * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either X and Y dimension,
 * and selects the corresponding load method such that the boundary detection logic is only added when needed.
 *
 * @note The variant is chosen by the preprocessor when this header is processed, from the build-time definitions of
 *       PARTIAL_STORE_M0 and PARTIAL_STORE_N0, not from the macro arguments of the same name. An identifier that is
 *       not defined evaluates to 0 inside the conditional, which selects the "no partial blocks" variant.
 *
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * The macro will result in a declaration of @p M0 vectors of size @p N0 with data
 * type @p DATA_TYPE containing values partially loaded from the specified
 * address in memory. The remaining (N0 - PARTIAL_STORE_N0) elements will be
 * filled with zeros.
 *
 * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] OFFSET           The offset within a row
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
 * @{
 */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // every remaining combination, i.e. PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
// Case4: Partial blocks in both x and y
#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif    // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
/** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE
700
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
 * @name LOAD_TEXTURE2D_ROW_n
 *
 * Row r (0-based) is read at ((X_COORD) + r * (X_STEP_ROW), (Y_COORD) + r * (Y_STEP_ROW)) and assigned to the
 * already-declared variable BASENAMEr (hexadecimal suffix: rows 10..15 are A..F).
 * Coordinate and step arguments are parenthesized, so expressions such as "a + b" can be passed safely.
 *
 * @note Consecutive row assignments are emitted without a separator and the last one carries no trailing ';'.
 *       NOTE(review): this presumably relies on READ_IMAGE2D expanding to a ';'-terminated expression - confirm
 *       against its definition before changing the layout.
 *
 * @param[in] N0         The number of pixels to read
 * @param[in] DATA_TYPE  The data type of variables
 * @param[in] BASENAME   The basename of the destination variables for the loaded rows
 * @param[in] IMG        The 2D OpenCL image object
 * @param[in] X_COORD    The x coordinate for the top-left pixel
 * @param[in] Y_COORD    The y coordinate for the top-left pixel
 * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
 * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
 * @{
 */
#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 0 * (X_STEP_ROW)), ((Y_COORD) + 0 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 1 * (X_STEP_ROW)), ((Y_COORD) + 1 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 2 * (X_STEP_ROW)), ((Y_COORD) + 2 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 3 * (X_STEP_ROW)), ((Y_COORD) + 3 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 4 * (X_STEP_ROW)), ((Y_COORD) + 4 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 5 * (X_STEP_ROW)), ((Y_COORD) + 5 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 6 * (X_STEP_ROW)), ((Y_COORD) + 6 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 7 * (X_STEP_ROW)), ((Y_COORD) + 7 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 8 * (X_STEP_ROW)), ((Y_COORD) + 8 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 9 * (X_STEP_ROW)), ((Y_COORD) + 9 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 10 * (X_STEP_ROW)), ((Y_COORD) + 10 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 11 * (X_STEP_ROW)), ((Y_COORD) + 11 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 12 * (X_STEP_ROW)), ((Y_COORD) + 12 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 13 * (X_STEP_ROW)), ((Y_COORD) + 13 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 14 * (X_STEP_ROW)), ((Y_COORD) + 14 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 15 * (X_STEP_ROW)), ((Y_COORD) + 15 * (Y_STEP_ROW)))
/** @} */ // end of group LOAD_TEXTURE2D_ROW_n
777
/** Load a 2D texture in unit of pixel. A pixel is made of 4 floating point values
 * @name LOAD_TEXTURE2D
 *
 * Supported cases are M0=1,2,3,...,16 (one LOAD_TEXTURE2D_ROW_n macro exists per value).
 * NOTE(review): this header previously stated both "N0=1" and "Only 1, 2 and 4 are supported" for N0;
 * N0 is forwarded unchanged to READ_IMAGE2D, so confirm the supported widths against its definition.
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 *
 * LOAD_TEXTURE2D forwards to LOAD_TEXTURE2D_STR so that @p M0 is macro-expanded before it is pasted onto
 * LOAD_TEXTURE2D_ROW_; M0 may therefore be a macro that expands to the row count.
 *
 * @param[in] M0         The number of consecutive rows
 * @param[in] N0         The number of consecutive pixels. Only 1, 2 and 4 are supported
 * @param[in] DATA_TYPE  The data type of the target
 * @param[in] BASENAME   The basename of the result variables
 * @param[in] IMG        The 2D OpenCL image object
 * @param[in] X_COORD    The x coordinate for the top-left pixel
 * @param[in] Y_COORD    The y coordinate for the top-left pixel
 * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
 * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
 * @{
 */
#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
/** @} */ // end of group LOAD_TEXTURE2D
799
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded.
 * @name LOAD_ROW_INDIRECT_n
 *
 * For every row r (hexadecimal suffix: rows 10..15 are A..F) the macro declares BASENAMEr and then either loads
 * it from (PTR) + (OFFSET) + Yr * (STRIDE_Y) when Y_MASKr is non-zero, or sets it to 0 otherwise.
 * PTR, OFFSET and STRIDE_Y are parenthesized, so expressions such as "a + b" can be passed safely.
 *
 * @param[in] N0        The number of columns to load
 * @param[in] DATA_TYPE The data type of variables
 * @param[in] BASENAME  The basename of the destination variables for the loaded rows
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Y         The basename of the per-row y-axis offsets (Y0 to Yn-1)
 * @param[in] Y_MASK    The basename of the per-row y-axis masks (Y_MASK0 to Y_MASKn-1). If 0, forces BASENAMEn to 0
 * @{
 */
#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##0;                                                                                      \
    if(Y_MASK##0 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##0 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##0 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##1;                                                                                      \
    if(Y_MASK##1 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##1 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##1 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##2;                                                                                      \
    if(Y_MASK##2 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##2 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##2 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##3;                                                                                      \
    if(Y_MASK##3 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##3 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##3 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##4;                                                                                      \
    if(Y_MASK##4 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##4 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##4 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##5;                                                                                      \
    if(Y_MASK##5 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##5 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##5 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##6;                                                                                      \
    if(Y_MASK##6 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##6 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##6 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##7;                                                                                      \
    if(Y_MASK##7 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##7 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##7 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                \
    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##8;                                                                                      \
    if(Y_MASK##8 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##8 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##8 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)               \
    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                    \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##9;                                                                                      \
    if(Y_MASK##9 != 0)                                                                                \
    {                                                                                                 \
        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##9 * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##9 = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)               \
    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                   \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##A;                                                                                      \
    if(Y_MASK##A != 0)                                                                                \
    {                                                                                                 \
        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##A * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##A = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)               \
    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                   \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##B;                                                                                      \
    if(Y_MASK##B != 0)                                                                                \
    {                                                                                                 \
        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##B * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##B = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)               \
    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                   \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##C;                                                                                      \
    if(Y_MASK##C != 0)                                                                                \
    {                                                                                                 \
        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##C * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##C = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)               \
    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                   \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##D;                                                                                      \
    if(Y_MASK##D != 0)                                                                                \
    {                                                                                                 \
        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##D * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##D = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)               \
    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                   \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##E;                                                                                      \
    if(Y_MASK##E != 0)                                                                                \
    {                                                                                                 \
        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##E * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##E = 0;                                                                              \
    }

#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)               \
    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)                   \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                                                      \
    BASENAME##F;                                                                                      \
    if(Y_MASK##F != 0)                                                                                \
    {                                                                                                 \
        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##F * (STRIDE_Y)));     \
    }                                                                                                 \
    else                                                                                              \
    {                                                                                                 \
        BASENAME##F = 0;                                                                              \
    }
/** @} */ // end of group LOAD_ROW_INDIRECT_n
955
/** Load blocks (consecutive rows and columns) with Y offset.
 * @name LOAD_BLOCK_INDIRECT
 *
 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 * The Y offsets and Y masks are expected to have consecutive names.
 * E.g., for M0=3, Y=yi and Y_MASK=ym, the expected names are yi0, yi1, yi2 and ym0, ym1, ym2.
 *
 * LOAD_BLOCK_INDIRECT forwards to LOAD_BLOCK_INDIRECT_STR so that @p M0 is macro-expanded before it is pasted
 * onto LOAD_ROW_INDIRECT_; M0 may therefore be a macro that expands to the row count.
 *
 * @param[in] M0        The number of consecutive rows
 * @param[in] N0        The number of consecutive columns
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @param[in] Y         The basename of the per-row y-axis offsets
 * @param[in] Y_MASK    The basename of the per-row y-axis masks. If 0, forces BASENAMEn to 0
 * @{
 */
#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
/** @} */ // end of group LOAD_BLOCK_INDIRECT
978
/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
 * @name LOAD_ELEMENT_n
 *
 * For every row r (hexadecimal suffix: rows 10..15 are A..F) the macro declares BASENAMEr as a vector of
 * @p N0 elements and initialises it from the single DATA_TYPE value stored at (PTR) + (OFFSET) + r * (STRIDE_Y).
 * PTR, OFFSET and STRIDE_Y are parenthesized, so expressions such as "a + b" can be passed safely.
 *
 * @param[in] N0        The size of each destination vector
 * @param[in] DATA_TYPE The data type of variables
 * @param[in] BASENAME  The basename of the destination variables for the loaded rows
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @{
 */
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##0 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 0 * (STRIDE_Y)));

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##1 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 1 * (STRIDE_Y)));

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##2 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 2 * (STRIDE_Y)));

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##3 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 3 * (STRIDE_Y)));

#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##4 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 4 * (STRIDE_Y)));

#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##5 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 5 * (STRIDE_Y)));

#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##6 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 6 * (STRIDE_Y)));

#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##7 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 7 * (STRIDE_Y)));

#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##8 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 8 * (STRIDE_Y)));

#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##9 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 9 * (STRIDE_Y)));

#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##A = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 10 * (STRIDE_Y)));

#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##B = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 11 * (STRIDE_Y)));

#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##C = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 12 * (STRIDE_Y)));

#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##D = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 13 * (STRIDE_Y)));

#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##E = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 14 * (STRIDE_Y)));

#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##F = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 15 * (STRIDE_Y)));

/** @}*/ // end of group LOAD_ELEMENT_n
1070
/** Load Scalar as Vector (consecutive elements).
 * @name LOAD_SCALAR_AS_VECTOR
 *
 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 *
 * LOAD_SCALAR_AS_VECTOR forwards to LOAD_SCALAR_AS_VECTOR_STR so that @p M0 is macro-expanded before it is
 * pasted onto LOAD_ELEMENT_; M0 may therefore be a macro that expands to the row count.
 *
 * @param[in] M0        The number of consecutive rows
 * @param[in] N0        The number of consecutive columns
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @{
 */
#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
1090
/** Basic macros to calculate Z offset values from Z0 to Zn-1
 * @name CALCULATE_Z_OFFSET_n
 *
 * For every row r the already-declared variable Zr is computed as
 *   min(DEPTH_GEMM3D - 1, (r + Y) / HEIGHT_GEMM3D) * (CROSS_PLANE_PAD * STRIDE_Y)
 * with the arithmetic carried out in @p DATA_TYPE.
 * All value arguments are parenthesized, so expressions such as "a + b" can be passed safely.
 *
 * @param[in] M0              The number of offset values to calculate (not referenced by the expansion itself)
 * @param[in] DATA_TYPE       The data type of the results
 * @param[in] Z               The basename of the result variables
 * @param[in] Y               The work-item ID of y-axis
 * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
 * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
 * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
 * @param[in] STRIDE_Y        The stride value in y-axis direction
 *
 * @{
 */
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##0 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##0);                                                    \
    Z##0 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##1 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##1);                                                    \
    Z##1 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##2 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##2);                                                    \
    Z##2 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##3 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##3);                                                    \
    Z##3 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##4 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##4);                                                    \
    Z##4 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##5 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##5);                                                    \
    Z##5 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##6 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##6);                                                    \
    Z##6 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));
1145
1146#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
1147 CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
Gian Marco Iodiceff1fe3e2021-01-02 09:58:51 +00001148 Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001149 Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \
Usama Arif0681e3b2019-04-25 14:28:07 +01001150 Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
1151
/** @} */ // end of group CALCULATE_Z_OFFSET_n
Usama Arif0681e3b2019-04-25 14:28:07 +01001153
/** Calculate Z offset values from Z0 to Zn-1
 * @name CALCULATE_Z_OFFSET
 *
 * The Z offsets are expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected names of the Z offsets are zin0, zin1, zin2.
 * Note that CROSS_PLANE_PAD (cross plane padding) is required to take into account
 * the possible cross plane paddings in case the plane changes across the z-dimension.
 *
 * <!--
 * |                  |
 * |      plane0      |
 * |                  |
 * |__________________|
 * |******************|
 * |  cross_plane_pad |
 * |******************|
 * |                  |
 * |      plane1      |
 * |                  |
 * |__________________|
 * -->
 *
 * @param[in] M0              The number of offset values to calculate. It is token-pasted onto
 *                            CALCULATE_Z_OFFSET_, so it must expand to a literal for which
 *                            CALCULATE_Z_OFFSET_<M0> is defined
 * @param[in] DATA_TYPE       The data type of the results
 * @param[in] Z               The basename of the result variables
 * @param[in] Y               The work-item ID of y-axis
 * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
 * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
 * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
 * @param[in] STRIDE_Y        The stride value in y-axis direction
 * @{
 */
/* Two levels are needed: the outer macro fully expands M0 before the inner one pastes it. */
#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
/** @} */ // end of group CALCULATE_Z_OFFSET
1189
/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
 * @name SCALE_ROW_n
 *
 * SCALE_ROW_n expands SCALE_ROW_(n-1) and then scales row n-1. Rows 10 to 15 use the
 * suffixes A to F. SCALE is parenthesised before the cast so that an expression argument
 * is converted to DATA_TYPE as a whole.
 *
 * @param[in] DATA_TYPE The data type of the variables
 * @param[in] BASENAME  The basename of the variables
 * @param[in] SCALE     The scale factor
 * @{
 */
#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##0 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##1 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##2 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##3 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##4 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##5 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##6 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##7 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##8 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##9 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##A *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##B *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##C *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##D *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##E *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##F *= (DATA_TYPE)(SCALE);
/** @} */ // end of group SCALE_ROW_n
Usama Arif0681e3b2019-04-25 14:28:07 +01001261
/** Scale elements stored in a block (BASENAME)
 * @name SCALE_BLOCK
 *
 * Supported cases are N=1,2,3,...,16
 *
 * @param[in] N         The number of rows in the block. It is token-pasted onto SCALE_ROW_,
 *                      so it must expand to a decimal literal in the supported range
 * @param[in] DATA_TYPE The data type of the block
 * @param[in] BASENAME  The basename of the block
 * @param[in] SCALE     The scale factor
 * @{
 */
/* Two levels are needed: the outer macro fully expands N before the inner one pastes it. */
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
/** @} */ // end of group SCALE_BLOCK
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001276
/** Create a new vector containing the values at the given index for a set of given vectors
 * @name COLUMN_VECTORn
 *
 * COLUMN_VECTORn declares a variable named BASENAME<IDX_COL> and initialises it with
 * component .s<IDX_COL> of each of the n source variables X0, X1, ... (suffixes A to F
 * are used for sources 10 to 15).
 *
 * @param[in] IDX_COL  The index value. It is pasted after ".s" and after BASENAME, so it must be
 *                     a single component suffix (e.g. 0 to 9, A to F)
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] X        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 * @{
 */
#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
                                                 (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
                                                  (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
                                                  (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
                                                  (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
/** @} */ // end of group COLUMN_VECTORn
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001304
/** Create a new vector containing the values at the given index. Utility macros for transposing a column-vector
 * @name COLUMN_VECTOR_SCALARn
 *
 * COLUMN_VECTOR_SCALARn declares a variable named BASENAME<IDX_COL> and initialises it with
 * the n source variables X0, X1, ... themselves (no component is selected; suffixes A to F
 * are used for sources 10 to 15).
 *
 * @param[in] IDX_COL  The index value. It is only used as the suffix of the destination name
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] X        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 * @{
 */
#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
                                                  (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
/** @} */ // end of group COLUMN_VECTOR_SCALARn
1332
/** Create transposed vectors of the given vectors
 * @name TRANSPOSE_K0Xn
 *
 * TRANSPOSE_K0Xn emits one COLUMN_VECTOR per column index 0..n-1 (indices 10 to 15 are
 * written A to F). The n=1 case uses COLUMN_VECTOR_SCALAR instead, i.e. the sources are
 * taken as they are rather than through a ".s0" component access.
 *
 * @param[in] K0       The size of the source vectors
 * @param[in] BASENAME The basename of transposed vectors
 * @param[in] BS       The basename of source vectors for transposition
 * @param[in] TYPE     The data type of the transposed vectors
 * @{
 */
#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);

/** @} */ // end of group TRANSPOSE_K0Xn
1371
/** Create column vectors to contain the values at the given index for a set of given vectors
 *
 * Dispatches to COLUMN_VECTOR<K0>. K0 is expanded before it is concatenated, so it may
 * itself be a macro.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] BS       The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR, K0) \
    (IDX_COL, BASENAME, BS, TYPE);
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001383
/** Create column vectors to contain the values at the given index. Utility macro for transposing a column-vector
 *
 * Dispatches to COLUMN_VECTOR_SCALAR<K0>. K0 is expanded before it is concatenated, so it
 * may itself be a macro.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] BS       The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 */
#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR_SCALAR, K0) \
    (IDX_COL, BASENAME, BS, TYPE);
Gian Marco Iodice061eefd2020-04-23 13:40:00 +01001395
/** Create transposed vectors from the given source vectors
 *
 * Dispatches to TRANSPOSE_K0X<N0>. N0 is expanded before it is concatenated, so it may
 * itself be a macro.
 *
 * @param[in] K0       The size of source vectors
 * @param[in] N0       The number of source vectors
 * @param[in] BASENAME The basename of transposed vectors
 * @param[in] BS       The basename of source vectors for transposition
 * @param[in] TYPE     The data type of the transposed vectors
 *
 */
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
    CONCAT(TRANSPOSE_K0X, N0) \
    (K0, BASENAME, BS, TYPE);
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001408
/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
 * @name ADD_ROW_n
 *
 * ADD_ROW_n expands ADD_ROW_(n-1) and then adds row n-1. Rows 10 to 15 use the suffixes
 * A to F for both BASENAME and BIAS.
 *
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The basename of the added variables
 * @{
 */
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS) \
    BASENAME##2 += BIAS##2;

#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS) \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS) \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS) \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS) \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS) \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS) \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS) \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS) \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS) \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS) \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS) \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS) \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS) \
    BASENAME##F += BIAS##F;

/** @} */ // end of group ADD_ROW_n
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001480
/** Add the block (BIAS) to another block (BASENAME)
 * @name ADD_BLOCK
 *
 * Supported cases are N=1,2,3,...,16
 *
 * @param[in] N        The number of vectors in the block. It is token-pasted onto ADD_ROW_,
 *                     so it must expand to a decimal literal in the supported range
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The basename of the added variables
 * @{
 */
/* Two levels are needed: the outer macro fully expands N before the inner one pastes it. */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK
1494
/** Broadcast (add a single value) to each of the destination variables
 * @name ADD_ROW_BROADCAST_n
 *
 * ADD_ROW_BROADCAST_n expands ADD_ROW_BROADCAST_(n-1) and then adds BIAS to row n-1.
 * Rows 10 to 15 use the suffixes A to F. BIAS is substituted once per row, so it is
 * evaluated n times.
 *
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The variable containing the value to add
 * @{
 */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    BASENAME##5 += BIAS;

#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    BASENAME##6 += BIAS;

#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    BASENAME##7 += BIAS;

#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    BASENAME##8 += BIAS;

#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    BASENAME##9 += BIAS;

#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    BASENAME##A += BIAS;

#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    BASENAME##B += BIAS;

#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    BASENAME##C += BIAS;

#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    BASENAME##D += BIAS;

#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    BASENAME##E += BIAS;

#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    BASENAME##F += BIAS;
/** @} */ // end of group ADD_ROW_BROADCAST_n
1564
/** Broadcast (add a value) to each element of the destination block (BASENAME)
 * @name ADD_BLOCK_BROADCAST
 *
 * Supported cases are N=1,2,3,...,16.
 *
 * @param[in] N        The number of vectors in the block. It is token-pasted onto
 *                     ADD_ROW_BROADCAST_, so it must expand to a decimal literal in the supported range
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The variable containing the value to add
 * @{
 */
/* Two levels are needed: the outer macro fully expands N before the inner one pastes it. */
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK_BROADCAST
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001578
/** Apply activation to the given variables
 * @name ACTIVATION_ROW_n
 *
 * ACTIVATION_ROW_n expands ACTIVATION_ROW_(n-1) and then overwrites row n-1 with the
 * result of ACTIVATION applied to it. Rows 10 to 15 use the suffixes A to F.
 *
 * @param[in] ACTIVATION_TYPE The type of the activation
 * @param[in] DATA_TYPE       The data type of the vectors
 * @param[in] VEC_SIZE        The vector size, forwarded unchanged to ACTIVATION
 * @param[in] BASENAME        The basename of the variables
 * @param[in] A_VAL           Additional value required by the activation
 * @param[in] B_VAL           Additional value required by the activation
 * @{
 */
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);

#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);

#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);

#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);

#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);

#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);

#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
/** @} */ // end of group ACTIVATION_ROW_n
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001652
/** Apply activation to a block (BASENAME)
 * @name ACTIVATION_BLOCK
 *
 * Supported cases are N=1,2,3,...,16.
 *
 * @param[in] N               The number of vectors in the block. It is token-pasted onto
 *                            ACTIVATION_ROW_, so it must expand to a decimal literal in the supported range
 * @param[in] ACTIVATION_TYPE The type of the activation
 * @param[in] DATA_TYPE       The data type of the vectors
 * @param[in] VEC_SIZE        The vector size, forwarded unchanged to ACTIVATION_ROW_n
 * @param[in] BASENAME        The basename of the variables
 * @param[in] A_VAL           Additional value required by the activation
 * @param[in] B_VAL           Additional value required by the activation
 * @{
 */
/* Two levels are needed: the outer macro fully expands N before the inner one pastes it. */
#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
/** @} */ // end of group ACTIVATION_BLOCK
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01001669
/** Apply convert_<data_type> to the given variables
 * @name CONVERT_ROW_n
 *
 * CONVERT_ROW_n expands CONVERT_ROW_(n-1) and then declares the destination variable for
 * row n-1, initialised with the converted source row. The destination variables are
 * declared by these macros, so they must not already exist in the enclosing scope.
 * Rows 10 to 15 use the suffixes A to F.
 *
 * @param[in] N            The size of the vectors
 * @param[in] DATA_TYPE    The data type of the vectors
 * @param[in] BASENAME_SRC The basename of the source variables
 * @param[in] BASENAME_DST The basename of the destination variables
 */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
1731
1732#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1733 CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1734 VEC_DATA_TYPE(DATA_TYPE, N) \
1735 BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
1736
1737#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1738 CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1739 VEC_DATA_TYPE(DATA_TYPE, N) \
1740 BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
1741
1742#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1743 CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1744 VEC_DATA_TYPE(DATA_TYPE, N) \
1745 BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
1746
1747#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1748 CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1749 VEC_DATA_TYPE(DATA_TYPE, N) \
1750 BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
1751
1752#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1753 CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1754 VEC_DATA_TYPE(DATA_TYPE, N) \
1755 BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
Sang-Hoon Park11b0b8a2019-11-05 13:29:19 +00001756/** @} */ // end of group CONVERT_ROW_n
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01001757
/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
 * @name CONVERT_BLOCK
 *
 * Supported cases M=1,2,3,...,16 (one CONVERT_ROW_n macro exists per row count).
 * N is forwarded unchanged as the vector size of every row.
 *
 * CONVERT_BLOCK expands its arguments before handing them to CONVERT_BLOCK_STR,
 * which pastes M onto CONVERT_ROW_; this allows M to be a macro that evaluates
 * to the row count.
 *
 * @param[in] M            The number of vectors to convert
 * @param[in] N            The size of the vectors
 * @param[in] DATA_TYPE    The data type of the vectors
 * @param[in] BASENAME_SRC The basename of the source variables
 * @param[in] BASENAME_DST The basename of the destination variables
 * @{
 */
#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
/** @} */ // end of group CONVERT_BLOCK