/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +010024#include "activation_float_helpers.h"
Usama Arif0681e3b2019-04-25 14:28:07 +010025#include "helpers.h"
26
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
 * @name LOAD_ROW_n
 *
 * LOAD_ROW_n first expands LOAD_ROW_(n-1) and then declares and initialises one more
 * variable, so the expansion is a sequence of declarations rather than a single statement.
 * Rows 10 to 15 use the hexadecimal suffixes A to F, both for BASENAME and for Z.
 *
 * OFFSET, the row stride and the Z offset are added to PTR before the cast to
 * (__global DATA_TYPE *), i.e. they are expressed in units of the type PTR points to.
 * STRIDE_Y is parenthesised so that an expression argument is scaled as a whole by the row index.
 *
 * @param[in] N0        The number of elements to load per row (vector width passed to VLOAD)
 * @param[in] DATA_TYPE The data type of variables
 * @param[in] BASENAME  The basename of the destination variables for the loaded rows
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The basename of the z-axis offset variables (Z0 to Zn-1)
 * @{
 */
#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * (STRIDE_Y) + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * (STRIDE_Y) + Z##1));

#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * (STRIDE_Y) + Z##2));

#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * (STRIDE_Y) + Z##3));

#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * (STRIDE_Y) + Z##4));

#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * (STRIDE_Y) + Z##5));

#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * (STRIDE_Y) + Z##6));

#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * (STRIDE_Y) + Z##7));

#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * (STRIDE_Y) + Z##8));

#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * (STRIDE_Y) + Z##9));

#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * (STRIDE_Y) + Z##A));

#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * (STRIDE_Y) + Z##B));

#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * (STRIDE_Y) + Z##C));

#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * (STRIDE_Y) + Z##D));

#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * (STRIDE_Y) + Z##E));

#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * (STRIDE_Y) + Z##F));

/** @}*/ // end of group LOAD_ROW_n

/** Load Blocks (consecutive rows and columns) with Z offset.
 * @name LOAD_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
 *
 * LOAD_BLOCK forwards to LOAD_BLOCK_STR so that M0 is macro-expanded before it is
 * pasted onto LOAD_ROW_; M0 must therefore expand to a plain integer literal.
 *
 * @param[in] M0        The number of consecutive rows
 * @param[in] N0        The number of consecutive columns
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @param[in] Z         The z-axis offset vector
 * @{
 */
#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK

/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
 * @name LOAD_TEXTURE2D_ROW_n
 *
 * Unlike LOAD_ROW_n, these macros only assign: the variables BASENAME0 to BASENAMEn-1
 * must already be declared by the caller.
 * Rows 10 to 15 use the hexadecimal suffixes A to F.
 * X_STEP_ROW and Y_STEP_ROW are parenthesised so that expression arguments are scaled
 * as a whole by the row index.
 *
 * NOTE(review): consecutive rows are chained without an explicit ';' between the
 * assignments, so READ_IMAGE2D presumably expands to text ending in ';' - confirm in helpers.h.
 *
 * @param[in] N0         The number of pixels to read
 * @param[in] DATA_TYPE  The data type of variables
 * @param[in] BASENAME   The basename of the destination variables for the loaded rows
 * @param[in] IMG        The 2D OpenCL image object
 * @param[in] X_COORD    The x coordinate for the top-left pixel
 * @param[in] Y_COORD    The y coordinate for the top-left pixel
 * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
 * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
 * @{
 */
#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * (X_STEP_ROW)), (Y_COORD + 0 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * (X_STEP_ROW)), (Y_COORD + 1 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * (X_STEP_ROW)), (Y_COORD + 2 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * (X_STEP_ROW)), (Y_COORD + 3 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * (X_STEP_ROW)), (Y_COORD + 4 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * (X_STEP_ROW)), (Y_COORD + 5 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * (X_STEP_ROW)), (Y_COORD + 6 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * (X_STEP_ROW)), (Y_COORD + 7 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * (X_STEP_ROW)), (Y_COORD + 8 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * (X_STEP_ROW)), (Y_COORD + 9 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * (X_STEP_ROW)), (Y_COORD + 10 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * (X_STEP_ROW)), (Y_COORD + 11 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * (X_STEP_ROW)), (Y_COORD + 12 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * (X_STEP_ROW)), (Y_COORD + 13 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * (X_STEP_ROW)), (Y_COORD + 14 * (Y_STEP_ROW)))

#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * (X_STEP_ROW)), (Y_COORD + 15 * (Y_STEP_ROW)))
/** @} */ // end of group LOAD_TEXTURE2D_ROW_n

/** Load a 2D texture in unit of pixel. A pixel is made of 4 floating point values
 * @name LOAD_TEXTURE2D
 *
 * Supported cases are M0=1,2,3,...,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 *
 * LOAD_TEXTURE2D forwards to LOAD_TEXTURE2D_STR so that M0 is macro-expanded before it is
 * pasted onto LOAD_TEXTURE2D_ROW_; M0 must therefore expand to a plain integer literal.
 *
 * NOTE(review): this comment previously listed "N0=1" as the only supported case while the
 * N0 parameter description lists 1, 2 and 4 - confirm against the READ_IMAGE2D implementation.
 *
 * @param[in] M0         The number of consecutive rows
 * @param[in] N0         The number of consecutive pixels. Only 1, 2 and 4 are supported
 * @param[in] DATA_TYPE  The data type of the target
 * @param[in] BASENAME   The basename of the result variables
 * @param[in] IMG        The 2D OpenCL image object
 * @param[in] X_COORD    The x coordinate for the top-left pixel
 * @param[in] Y_COORD    The y coordinate for the top-left pixel
 * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
 * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
 * @{
 */
#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
/** @} */ // end of group LOAD_TEXTURE2D

/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
 * @name LOAD_ELEMENT_n
 *
 * For each row a single DATA_TYPE element is read and used to initialise a newly declared
 * variable of type VEC_DATA_TYPE(DATA_TYPE, N0). LOAD_ELEMENT_n first expands
 * LOAD_ELEMENT_(n-1), so the expansion is a sequence of declarations.
 * Rows 10 to 15 use the hexadecimal suffixes A to F.
 *
 * OFFSET and the row stride are added to PTR before the cast to (__global DATA_TYPE *),
 * i.e. they are expressed in units of the type PTR points to.
 * STRIDE_Y is parenthesised so that an expression argument is scaled as a whole by the row index.
 *
 * @param[in] N0        The width of the destination vector variables
 * @param[in] DATA_TYPE The data type of variables
 * @param[in] BASENAME  The basename of the destination variables for the loaded rows
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @{
 */
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * (STRIDE_Y)));

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * (STRIDE_Y)));

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * (STRIDE_Y)));

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * (STRIDE_Y)));

#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * (STRIDE_Y)));

#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * (STRIDE_Y)));

#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * (STRIDE_Y)));

#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * (STRIDE_Y)));

#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * (STRIDE_Y)));

#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * (STRIDE_Y)));

#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * (STRIDE_Y)));

#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * (STRIDE_Y)));

#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * (STRIDE_Y)));

#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * (STRIDE_Y)));

#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * (STRIDE_Y)));

#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * (STRIDE_Y)));

/** @}*/ // end of group LOAD_ELEMENT_n

/** Load Scalar as Vector (consecutive elements).
 * @name LOAD_SCALAR_AS_VECTOR
 *
 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 *
 * LOAD_SCALAR_AS_VECTOR forwards to LOAD_SCALAR_AS_VECTOR_STR so that M0 is macro-expanded
 * before it is pasted onto LOAD_ELEMENT_; M0 must therefore expand to a plain integer literal.
 *
 * @param[in] M0        The number of consecutive rows
 * @param[in] N0        The number of consecutive columns
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @{
 */
#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
/** @} */ // end of group LOAD_SCALAR_AS_VECTOR

/** Basic macros to calculate Z offset values from Z0 to Zn-1
 * @name CALCULATE_Z_OFFSET_n
 *
 * For each row index i in [0, n-1] the expansion performs, on an already declared variable Zi:
 *   Zi = (i + Y * M0) / HEIGHT_GEMM3D;     // plane index of the row
 *   Zi = min(DEPTH_GEMM3D - 1, Zi);        // clamp to the last plane
 *   Zi *= CROSS_PLANE_PAD * STRIDE_Y;      // scale to an offset
 * CALCULATE_Z_OFFSET_n first expands CALCULATE_Z_OFFSET_(n-1). Only n = 1 to 8 are defined here.
 *
 * Every value argument is parenthesised so that an expression argument (e.g. a sum) is cast,
 * multiplied or divided as a whole. HEIGHT_GEMM3D is used as a divisor and must be non-zero.
 *
 * @param[in] M0              The number of rows per work-item; Y is multiplied by it to obtain the first row
 * @param[in] DATA_TYPE       The data type of the results
 * @param[in] Z               The basename of the result variables
 * @param[in] Y               The work-item ID of y-axis
 * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
 * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
 * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
 * @param[in] STRIDE_Y        The stride value in y-axis direction
 *
 * @{
 */
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##0 = (0 + (DATA_TYPE)((Y) * (DATA_TYPE)(M0))) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##0 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##0); \
    Z##0 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##1 = (1 + (DATA_TYPE)((Y) * (DATA_TYPE)(M0))) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##1 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##1); \
    Z##1 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##2 = (2 + (DATA_TYPE)((Y) * (DATA_TYPE)(M0))) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##2 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##2); \
    Z##2 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##3 = (3 + (DATA_TYPE)((Y) * (DATA_TYPE)(M0))) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##3 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##3); \
    Z##3 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##4 = (4 + (DATA_TYPE)((Y) * (DATA_TYPE)(M0))) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##4 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##4); \
    Z##4 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##5 = (5 + (DATA_TYPE)((Y) * (DATA_TYPE)(M0))) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##5 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##5); \
    Z##5 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##6 = (6 + (DATA_TYPE)((Y) * (DATA_TYPE)(M0))) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##6 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##6); \
    Z##6 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##7 = (7 + (DATA_TYPE)((Y) * (DATA_TYPE)(M0))) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##7 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##7); \
    Z##7 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

/** @} */ // end of group CALCULATE_Z_OFFSET_n

/** Calculate Z offset values from Z0 to Zn-1
 * @name CALCULATE_Z_OFFSET
 *
 * The Z offsets are expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin0, zin1, zin2.
 * Note that, CROSS_PLANE_PAD (cross plane padding) is required to take into account
 * the possible cross plane paddings in case of the plane changes across the z-dimension.
 *
 * <!--
 * |                  |
 * |      plane0      |
 * |                  |
 * |__________________|
 * |******************|
 * |  cross_plane_pad |
 * |******************|
 * |                  |
 * |      plane1      |
 * |                  |
 * |__________________|
 * -->
 *
 * CALCULATE_Z_OFFSET forwards to CALCULATE_Z_OFFSET_STR so that M0 is macro-expanded before it
 * is pasted onto CALCULATE_Z_OFFSET_; M0 must therefore expand to a plain integer literal for
 * which a CALCULATE_Z_OFFSET_n macro exists.
 *
 * @param[in] M0              The number of offset values to calculate
 * @param[in] DATA_TYPE       The data type of the results
 * @param[in] Z               The basename of the result variables
 * @param[in] Y               The work-item ID of y-axis
 * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
 * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
 * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
 * @param[in] STRIDE_Y        The stride value in y-axis direction
 * @{
 */
#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
/** @} */ // end of group CALCULATE_Z_OFFSET

/** Store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_n
 *
 * STORE_ROW_n first expands STORE_ROW_(n-1) and then stores one more row through VSTORE(N0).
 * Rows 10 to 15 use the hexadecimal suffixes A to F, both for BASENAME and for Z.
 *
 * The row stride and the Z offset are added to PTR before the cast to (__global DATA_TYPE *),
 * i.e. they are expressed in units of the type PTR points to.
 * STRIDE_Y is parenthesised so that an expression argument is scaled as a whole by the row index.
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * (STRIDE_Y) + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * (STRIDE_Y) + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * (STRIDE_Y) + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * (STRIDE_Y) + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * (STRIDE_Y) + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * (STRIDE_Y) + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * (STRIDE_Y) + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * (STRIDE_Y) + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * (STRIDE_Y) + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * (STRIDE_Y) + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * (STRIDE_Y) + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * (STRIDE_Y) + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * (STRIDE_Y) + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * (STRIDE_Y) + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * (STRIDE_Y) + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * (STRIDE_Y) + Z##F));
/** @} */ // end of group STORE_ROW_n

/** Partially store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_PARTIAL_n
 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
 *
 * STORE_ROW_PARTIAL_n first expands STORE_ROW_PARTIAL_(n-1) and then stores one more row
 * through VSTORE_PARTIAL(N0, STORE_N0).
 * The row stride and the Z offset are added to PTR before the cast to (__global DATA_TYPE *),
 * i.e. they are expressed in units of the type PTR points to.
 * STRIDE_Y is parenthesised so that an expression argument is scaled as a whole by the row index.
 *
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: [1-16] and <= @p N0
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * (STRIDE_Y) + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * (STRIDE_Y) + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * (STRIDE_Y) + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * (STRIDE_Y) + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * (STRIDE_Y) + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * (STRIDE_Y) + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * (STRIDE_Y) + Z##6));

593#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
594 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
595 VSTORE_PARTIAL(N0, STORE_N0) \
596 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
597
598#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
599 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
600 VSTORE_PARTIAL(N0, STORE_N0) \
601 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
602
603#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
604 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
605 VSTORE_PARTIAL(N0, STORE_N0) \
606 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
607
608#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
609 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
610 VSTORE_PARTIAL(N0, STORE_N0) \
611 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
612
613#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
614 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
615 VSTORE_PARTIAL(N0, STORE_N0) \
616 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
617
618#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
619 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
620 VSTORE_PARTIAL(N0, STORE_N0) \
621 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
622
623#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
624 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
625 VSTORE_PARTIAL(N0, STORE_N0) \
626 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
627
628#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
629 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
630 VSTORE_PARTIAL(N0, STORE_N0) \
631 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
632
633#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
634 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
635 VSTORE_PARTIAL(N0, STORE_N0) \
636 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
637/** @} */ // end of groupd STORE_ROW_PARTIAL_n
638
/** Convert with saturation and store rows 0 to n-1 of the given variables
 * @name CONVERT_STORE_ROW_n
 * Each row is passed through CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0) before being stored.
 * CONVERT_STORE_ROW_n first expands CONVERT_STORE_ROW_(n-1) and then handles row n-1.
 *
 * @param[in] N0        The size of the vectors
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
693
/* Rows 0 to 9: expands CONVERT_STORE_ROW_9 and then converts and stores row 9.
 * The second parameter has to be named DATA_TYPE because the body refers to DATA_TYPE.
 * With any other parameter name the caller's type argument is ignored and DATA_TYPE is
 * taken from whatever the surrounding build defines (or is left undefined). */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
698
/* Rows 11 to 16 of the CONVERT_STORE_ROW_n chain (row suffixes A to F).
 * Each macro expands the previous one and then converts and stores its own last row. */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
728
/** @} */ // end of group CONVERT_STORE_ROW_n
730
/** Store a block of size M0xN0
 * @name STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The rows to store must be held in consecutively named variables:
 * for M0=3 and basename=c the variables are c0, c1 and c2.
 * The Z offsets must be named the same way:
 * for M0=3 and Z=zin the offsets are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
/* Pastes M0 onto STORE_ROW_ to select the row macro; M0 must already be a plain number here. */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/* Entry point: the extra level lets a macro passed as M0 be expanded before it is pasted. */
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK
Usama Arif0681e3b2019-04-25 14:28:07 +0100752
/** Partially store a block of size STORE_M0xSTORE_N0
 * @name STORE_BLOCK_PARTIAL
 *
 * @note The vector width @p N0 is also required for correct partial storing behaviour.
 * @note If @p STORE_N0 is not one of 1, 2, 3, 4, 8, 16, extra vstore(s) are invoked, incurring a small performance penalty.
 *
 * The rows to store must be held in consecutively named variables:
 * for STORE_M0=3 and basename=c the variables are c0, c1 and c2.
 * The Z offsets must be named the same way:
 * for STORE_M0=3 and Z=zin the offsets are zin0, zin1 and zin2.
 *
 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
/* Pastes STORE_M0 onto STORE_ROW_PARTIAL_; note that the row macro takes N0 before STORE_N0. */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/* Entry point: the extra level lets a macro passed as STORE_M0 be expanded before it is pasted. */
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
 *
 * @note If @p PARTIAL_STORE_N0 is not one of 1, 2, 3, 4, 8, 16, extra vstore(s) are invoked, incurring a small performance penalty.
 * @note A block counts as a boundary block when (id + 1) * block size >= total size, i.e. also when it ends
 *       exactly at the total size; the partial size is used for it in that case as well.
 * @note The macro declares the variables at_y_boundary and at_x_boundary in the enclosing scope,
 *       so it can be expanded only once per scope.
 *
 * The rows to store must be held in consecutively named variables:
 * for M0=3 and basename=c the variables are c0, c1 and c2.
 * The Z offsets must be named the same way:
 * for M0=3 and Z=zin the offsets are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] M                Total number of rows. Used to detect if current block is at the boundary in y.
 * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
 * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
 * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
 */
/* Arguments are parenthesised in the boundary tests so that expressions such as "2 + 2" keep their value. */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x) \
    bool at_y_boundary = ((y) + 1) * (M0) >= (M);                                                           \
    bool at_x_boundary = ((x) + 1) * (N0) >= (N);                                                           \
    if(!at_y_boundary && !at_x_boundary)                                                                    \
    {                                                                                                       \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                             \
    }                                                                                                       \
    else if(at_y_boundary && !at_x_boundary)                                                                \
    {                                                                                                       \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
    }                                                                                                       \
    else if(!at_y_boundary && at_x_boundary)                                                                \
    {                                                                                                       \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
    }                                                                                                       \
    else                                                                                                    \
    {                                                                                                       \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
/** Store a block that can only be partial in x but not y.
 *
 * @note If @p N0 or @p PARTIAL_STORE_N0 is not one of 1, 2, 3, 4, 8, 16, extra vstore(s) are invoked, incurring a small performance penalty.
 * @note A block counts as a boundary block when (x + 1) * N0 >= N, i.e. also when it ends exactly at N;
 *       @p PARTIAL_STORE_N0 is used for it in that case as well.
 * @note The macro declares the variable at_x_boundary in the enclosing scope, so it can be expanded only once per scope.
 *
 * The rows to store must be held in consecutively named variables:
 * for M0=3 and basename=c the variables are c0, c1 and c2.
 * The Z offsets must be named the same way:
 * for M0=3 and Z=zin the offsets are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
 * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
 */
/* Arguments are parenthesised in the boundary test so that expressions such as "4 + 4" keep their value. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, x) \
    bool at_x_boundary = ((x) + 1) * (N0) >= (N);                                             \
    if(!at_x_boundary)                                                                        \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
    }                                                                                         \
    else                                                                                      \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
/** Store a block that can only be partial in y but not x.
 *
 * @note If @p N0 is not one of 1, 2, 3, 4, 8, 16, extra vstore(s) are invoked, incurring a small performance penalty.
 * @note A block counts as a boundary block when (y + 1) * M0 >= M, i.e. also when it ends exactly at M;
 *       @p PARTIAL_STORE_M0 is used for it in that case as well.
 * @note The macro declares the variable at_y_boundary in the enclosing scope, so it can be expanded only once per scope.
 *
 * The rows to store must be held in consecutively named variables:
 * for M0=3 and basename=c the variables are c0, c1 and c2.
 * The Z offsets must be named the same way:
 * for M0=3 and Z=zin the offsets are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] M                Total number of rows. Used to detect if current block is at the boundary in y.
 * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
 */
/* Arguments are parenthesised in the boundary test so that expressions such as "2 + 2" keep their value. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, M, y) \
    bool at_y_boundary = ((y) + 1) * (M0) >= (M);                                             \
    if(!at_y_boundary)                                                                        \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
    }                                                                                         \
    else                                                                                      \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
878/** @} */ // end of group STORE_BLOCK_PARTIAL
879
/** Convert and store a block of size M0xN0
 * @name CONVERT_STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The rows to store must be held in consecutively named variables:
 * for M0=3 and basename=c the variables are c0, c1 and c2.
 * The Z offsets must be named the same way:
 * for M0=3 and Z=zin the offsets are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
/* Pastes M0 onto CONVERT_STORE_ROW_ to select the row macro; M0 must already be a plain number here. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/* Entry point: the extra level lets a macro passed as M0 be expanded before it is pasted. */
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100901
/** Scale rows 0 to n-1 of the given variables (BASENAME0 to BASENAMEn-1) in place
 * @name SCALE_ROW_n
 * Every row is multiplied by @p SCALE cast to @p DATA_TYPE.
 * SCALE_ROW_n first expands SCALE_ROW_(n-1) and then scales row n-1 (suffixes 0-9, A-F).
 *
 * @param[in] DATA_TYPE The data type of the variables
 * @param[in] BASENAME  The basename of the variables
 * @param[in] SCALE     The scale factor
 * @{
 */
#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##1 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) BASENAME##2 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) BASENAME##3 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) BASENAME##4 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) BASENAME##5 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) BASENAME##6 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) BASENAME##7 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) BASENAME##8 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) BASENAME##9 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) BASENAME##A *= (DATA_TYPE)SCALE;

#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) BASENAME##B *= (DATA_TYPE)SCALE;

#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) BASENAME##C *= (DATA_TYPE)SCALE;

#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) BASENAME##D *= (DATA_TYPE)SCALE;

#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) BASENAME##E *= (DATA_TYPE)SCALE;

#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) BASENAME##F *= (DATA_TYPE)SCALE;
/** @} */ // end of group SCALE_ROW_n
Usama Arif0681e3b2019-04-25 14:28:07 +0100973
/** Scale all rows of a block (BASENAME) in place
 * @name SCALE_BLOCK
 *
 * Supported cases are N=1,2,3,...,16
 *
 * @param[in] N         The number of rows in the block
 * @param[in] DATA_TYPE The data type of the block
 * @param[in] BASENAME  The basename of the block
 * @param[in] SCALE     The scale factor
 * @{
 */
/* Pastes N onto SCALE_ROW_ to select the row macro; N must already be a plain number here. */
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
/* Entry point: the extra level lets a macro passed as N be expanded before it is pasted. */
#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) \
    SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
/** @} */ // end of group SCALE_BLOCK
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100988
/** Declare a new vector holding component IDX_COL of each vector in a set of given vectors
 * @name COLUMN_VECTORn
 * COLUMN_VECTORn reads component .s<IDX_COL> of the n source vectors X0 to X(n-1) (suffixes 0-9, A-F)
 * and declares the variable BASENAME<IDX_COL> initialised with these n values.
 *
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] X        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 * @{
 */
#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);

#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) BASENAME##IDX_COL =     \
        (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);

#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) BASENAME##IDX_COL =     \
        (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);

#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) BASENAME##IDX_COL =     \
        (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);

#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE)                                    \
    VEC_DATA_TYPE(TYPE, 8) BASENAME##IDX_COL =                                        \
        (VEC_DATA_TYPE(TYPE, 8))(                                                     \
            (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
            (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);

#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE)                                   \
    VEC_DATA_TYPE(TYPE, 16) BASENAME##IDX_COL =                                       \
        (VEC_DATA_TYPE(TYPE, 16))(                                                    \
            (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
            (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
            (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
            (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
/** @} */ // end of group COLUMN_VECTORn
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001016
/** Declare a new vector built from a set of given scalar variables. Utility macros for transposing a column-vector
 * @name COLUMN_VECTOR_SCALARn
 * COLUMN_VECTOR_SCALARn takes the n variables X0 to X(n-1) (suffixes 0-9, A-F) as they are
 * and declares the variable BASENAME<IDX_COL> initialised with these n values.
 *
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] X        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 * @{
 */
#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));

#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));

#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));

#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));

#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) BASENAME##IDX_COL =            \
        (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));

#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE)                 \
    VEC_DATA_TYPE(TYPE, 16) BASENAME##IDX_COL =                            \
        (VEC_DATA_TYPE(TYPE, 16))(                                         \
            (X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
            (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
/** @} */ // end of group COLUMN_VECTOR_SCALARn
1044
/** Create transposed vectors of the given vectors
 * @name TRANSPOSE_K0Xn
 * TRANSPOSE_K0Xn creates one column vector per column index 0 to n-1 through COLUMN_VECTOR.
 * The single-column case goes through COLUMN_VECTOR_SCALAR instead.
 *
 * @param[in] K0       The size of the source vectors
 * @param[in] BASENAME The basename of transposed vectors
 * @param[in] B        The basename of source vectors for transposition
 * @param[in] TYPE     The data type of the transposed vectors
 * @{
 */
#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, B, TYPE);

#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);

#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);  \
    COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);

#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);  \
    COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);  \
    COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE);

/* Columns 0-3 come from TRANSPOSE_K0X4, columns 4-7 are added here. */
#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE)                                         \
    TRANSPOSE_K0X4(K0, BASENAME, B, TYPE);                                            \
    COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE); COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE); \
    COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE); COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE);
/* Columns 0-7 come from TRANSPOSE_K0X8, columns 8-F are added here.
 * The source basename parameter is named BS rather than B: a parameter named B would also
 * replace the literal column index B below, so column 11 would be created with the source
 * basename as its index instead of the hexadecimal digit B. */
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001081
Sang-Hoon Park11b0b8a2019-11-05 13:29:19 +00001082/** @} */ // end of group TRANSPOSE_K0Xn
1083
/** Create a column vector holding the values at the given index of a set of given vectors
 *
 * Dispatches to the COLUMN_VECTOR<K0> macro by concatenating COLUMN_VECTOR and @p K0.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] B        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \
    CONCAT(COLUMN_VECTOR, K0)(IDX_COL, BASENAME, B, TYPE);
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001095
/** Create a column vector holding the given values. Utility macro for transposing a column-vector
 *
 * Dispatches to the COLUMN_VECTOR_SCALAR<K0> macro by concatenating COLUMN_VECTOR_SCALAR and @p K0.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] B        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 */
#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, B, TYPE) \
    CONCAT(COLUMN_VECTOR_SCALAR, K0)(IDX_COL, BASENAME, B, TYPE);
1107
/** Create transposed vectors from the given source vectors
 *
 * Dispatches to the TRANSPOSE_K0X<N0> macro by concatenating TRANSPOSE_K0X and @p N0.
 *
 * @param[in] K0       The size of source vectors
 * @param[in] N0       The number of source vectors
 * @param[in] BASENAME The basename of transposed vectors
 * @param[in] B        The basename of source vectors for transposition
 * @param[in] TYPE     The data type of the transposed vectors
 *
 */
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \
    CONCAT(TRANSPOSE_K0X, N0)(K0, BASENAME, B, TYPE);
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001120
/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
 * @name ADD_ROW_n
 *
 * ADD_ROW_n expands to ADD_ROW_(n-1) followed by one more "+=" statement, so rows are
 * accumulated in ascending order. Row suffixes are single hexadecimal digits: 0-9, then A-F.
 * Every generated statement is already terminated by a semicolon.
 *
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The basename of the added variables
 * @{
 */
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS##2;

#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS##F;

/** @} */ // end of group ADD_ROW_n
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001192
/** Add the block (BIAS) to another block (BASENAME)
 * @name ADD_BLOCK
 *
 * Supported cases are N=1,2,3,...,16
 *
 * ADD_BLOCK forwards to ADD_BLOCK_STR so that N is macro-expanded before ADD_BLOCK_STR pastes
 * it onto ADD_ROW_; this lets callers pass a macro (e.g. a build-time block size) as N.
 *
 * @param[in] N        The number of vectors in the block
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The basename of the added variables
 * @{
 */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK
1206
/** Broadcast (add a single value) to each of the destination variables
 * @name ADD_ROW_BROADCAST_n
 *
 * ADD_ROW_BROADCAST_n expands to ADD_ROW_BROADCAST_(n-1) followed by one more "+=" statement.
 * The same BIAS expression is added to every row; it appears once per row in the expansion.
 * Row suffixes are single hexadecimal digits: 0-9, then A-F.
 *
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The variable containing the value to add
 * @{
 */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS;

#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS;

#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS;

#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS;

#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS;

#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS;

#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS;

#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS;

#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS;

#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS;

#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS;
/** @} */ // end of group ADD_ROW_BROADCAST_n
1276
/** Broadcast (add a value) to each element of the destination block (BASENAME)
 * @name ADD_BLOCK_BROADCAST
 *
 * Supported cases are N=1,2,3,...,16.
 *
 * ADD_BLOCK_BROADCAST forwards to ADD_BLOCK_BROADCAST_STR so that N is macro-expanded before
 * ADD_BLOCK_BROADCAST_STR pastes it onto ADD_ROW_BROADCAST_.
 *
 * @param[in] N        The number of vectors in the block
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The variable containing the value to add
 * @{
 */
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK_BROADCAST
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001290
/** Apply activation to the given variables
 * @name ACTIVATION_ROW_n
 *
 * ACTIVATION_ROW_n expands to ACTIVATION_ROW_(n-1) followed by one more assignment of the form
 * BASENAMEi = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAMEi, A_VAL, B_VAL);
 * i.e. every row is overwritten in place with its activated value.
 * Row suffixes are single hexadecimal digits: 0-9, then A-F.
 * ACTIVATION is defined outside this section (presumably in activation_float_helpers.h).
 *
 * @param[in] ACTIVATION_TYPE The type of the activation
 * @param[in] DATA_TYPE       The data type of the vectors
 * @param[in] BASENAME        The basename of the variables
 * @param[in] A_VAL           Additional value required by the activation
 * @param[in] B_VAL           Additional value required by the activation
 * @{
 */
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)      \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL);

#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL);

#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL);

#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL);

#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL);

#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL);

#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL);
/** @} */ // end of group ACTIVATION_ROW_n
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001364
/** Apply activation to a block (BASENAME)
 * @name ACTIVATION_BLOCK
 *
 * Supported cases are N=1,2,3,...,16.
 *
 * ACTIVATION_BLOCK forwards to ACTIVATION_BLOCK_STR so that N is macro-expanded before
 * ACTIVATION_BLOCK_STR pastes it onto ACTIVATION_ROW_.
 *
 * @param[in] N               The number of vectors in the block
 * @param[in] ACTIVATION_TYPE The type of the activation
 * @param[in] DATA_TYPE       The data type of the vectors
 * @param[in] BASENAME        The basename of the variables
 * @param[in] A_VAL           Additional value required by the activation
 * @param[in] B_VAL           Additional value required by the activation
 * @{
 */
#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
/** @} */ // end of group ACTIVATION_BLOCK
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01001381
/** Apply convert_<data_type> to the given variables
 * @name CONVERT_ROW_n
 *
 * CONVERT_ROW_n expands to CONVERT_ROW_(n-1) followed by one more declaration of the form
 * VEC_DATA_TYPE(DATA_TYPE, N) BASENAME_DSTi = CONVERT(BASENAME_SRCi, VEC_DATA_TYPE(DATA_TYPE, N));
 * The destination variables are therefore declared by the macro itself.
 * Row suffixes are single hexadecimal digits: 0-9, then A-F.
 * VEC_DATA_TYPE and CONVERT are defined outside this section.
 *
 * @param[in] N            The size of the vectors
 * @param[in] DATA_TYPE    The data type of the vectors
 * @param[in] BASENAME_SRC The basename of the source variables
 * @param[in] BASENAME_DST The basename of the destination variables
 * @{
 */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
/** @} */ // end of group CONVERT_ROW_n
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01001469
/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
 * @name CONVERT_BLOCK
 *
 * Supported cases are M=1,2,3,...,16.
 *
 * CONVERT_BLOCK forwards to CONVERT_BLOCK_STR so that M is macro-expanded before
 * CONVERT_BLOCK_STR pastes it onto CONVERT_ROW_.
 *
 * @param[in] M            The number of vectors to convert
 * @param[in] N            The size of the vectors
 * @param[in] DATA_TYPE    The data type of the vectors
 * @param[in] BASENAME_SRC The basename of the source variables
 * @param[in] BASENAME_DST The basename of the destination variables
 * @{
 */
#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
/** @} */ // end of group CONVERT_BLOCK
1484
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

/** Store a block in a boundary-aware way that does not require any padding
 * Store a block of the shape M0xN0 in a boundary-aware way that doesn't require any padding for partial blocks
 * @name STORE_BLOCK_BOUNDARY_AWARE
 *
 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
 *
 *  *--x-->                         x == 0                        x == 1
 *  |                  |<------------------------------N-------------------------->|
 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
 *  |     -------------#############################################################
 *  *     |          | |                               |...........................|
 * y == 0 | M0       | |      Non-boundary block       |....Boundary block in x....|
 *        |          | |                               |...........................|
 *        M          --#############################################################
 *        |          | |...............................|...........................|
 * y == 1 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
 *        |          | |...............................|...........................|
 *        |------------#############################################################
 *
 * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * This method ensures that in the end the dst tensor is stored without requirements for paddings.
 * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in the X and/or Y dimension,
 * and selects the corresponding store method such that the boundary detection logic is only added when needed.
 * The selection is made at preprocessing time from the build-time values of PARTIAL_STORE_M0 and PARTIAL_STORE_N0;
 * when either of them is not defined, the macro falls back to a plain STORE_BLOCK.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
 * @param[in] M                Total number of rows. Used to detect if current block is at the boundary in y.
 * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
 * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
 * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
 * @{
 */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, M, y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, x)

#else // remaining case; for supported values this means PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
// Case4: Partial blocks in both x and y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x)

#endif // selection on the values of PARTIAL_STORE_M0 and PARTIAL_STORE_N0

#else // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

// Fallback: without both partial sizes there is nothing to select on, so every block is stored in full
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE