blob: 2e49614f81bf723cf6a1f822d0104ae13b1926d2 [file] [log] [blame]
ramelg019cca5922021-11-11 10:05:00 +00001/*
2 * Copyright (c) 2017-2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Gian Marco Iodice4fb56702021-11-10 11:18:50 +000024#include "helpers.h"
25#include "tile_helpers.h"
ramelg019cca5922021-11-11 10:05:00 +000026#include "gemm_helpers.h"
27#include "repeat.h"
28
29#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
30#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
31#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
32#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
33#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
34#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
35#define CONCAT_INC(K0) INC##K0
36#define INC(K0) CONCAT_INC(K0)
37
38#if(SRC_WIDTH % K0)
39#define BOUNDARY_CONDITION_X(x, a) \
40 ({ \
41 a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
42 })
43#else // (SRC_WIDTH % K0)
44#define BOUNDARY_CONDITION_X(x, a) \
45 ({})
46#endif // (SRC_WIDTH % K0)
47
48#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
49 ({ \
50 if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \
51 { \
52 if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
53 { \
54 LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
55 } \
56 else \
57 { \
58 LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
59 } \
60 } \
61 else \
62 { \
63 if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
64 { \
65 LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
66 } \
67 else \
68 { \
69 LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
70 } \
71 } \
72 })
73
74/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
75 * the output matrix unrolling the values.
76 *
77 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
78 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
79 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
80 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
81 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
82 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
83 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
84 * @note Only the following values for M0, K0 and V0 are supported:
85 * M0: 2,3,4,5,6,7,8
86 * K0: 2,3,4,8,16
87 * V0: greater than 0
88 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
89 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
90 * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
91 * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
92 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
93 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
94 *
95 * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
96 * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
97 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
98 * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
99 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
100 * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
101 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
102 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
103 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
104 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
105 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
106 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
107 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
108 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
109 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
110 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
111 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
112 */
113__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
114 TENSOR3D_DECLARATION(dst)
115#if defined(REINTERPRET_INPUT_AS_3D)
116 ,
117 uint cross_plane_pad
118#endif // REINTERPRET_INPUT_AS_3D
119 )
120{
121 // Block size
122#define BLOCK_SIZE ((M0) * (K0))
123
124 // Output offset X
125#if defined(INTERLEAVE)
126#define OUTPUT_OFFSET_X (K0)
127#else // defined(INTERLEAVE)
128#define OUTPUT_OFFSET_X (BLOCK_SIZE)
129#endif // defined(INTERLEAVE)
130
131 // Output step X
132#if defined(INTERLEAVE)
133#define OUTPUT_STEP_X (K0) * (V0)
134#else // Do not interleave
135#define OUTPUT_STEP_X (K0)
136#endif // defined(INTERLEAVE)
137
138 // Compute source and destination addresses
139 uint x = get_global_id(0);
140 uint y = get_global_id(1);
141 uint z = get_global_id(2);
142
143 // ------------------ Compute input/output addresses ---------------------------
144
145 // Compute the input address
146 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
147
148 // Compute the output address
149 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
150 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
151
152 // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
153 REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
154
155#if defined(REINTERPRET_INPUT_AS_3D)
156 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
157 // multiply src_stride_z by DEPTH_GEMM3D
158
159 input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
160
161 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
162 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
163
164#else // defined(REINTERPRET_INPUT_AS_3D)
165
166 input_ptr += z * (uint)src_stride_z;
167
168#endif // defined(REINTERPRET_INPUT_AS_3D)
169
170 // Add offset for batched GEMM
171 output_ptr += z * (uint)dst_stride_z;
172
173 // ---------------------------Load input values --------------------------------
174 // Load values from the LHS matrix
175 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
176
177 LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
178
179 // ---------------------------Store output values ------------------------------
180 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
181 STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
182
183#undef BLOCK_SIZE
184#undef OUTPUT_OFFSET_X
185#undef OUTPUT_STEP_X
186}
187
188#if M0 == 2
189#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
190 ({ \
191 VEC_DATA_TYPE(DATA_TYPE, M0) \
192 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
193 VSTORE(M0) \
194 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
195 })
196#elif M0 == 3 // M0 == 3
197#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
198 ({ \
199 VEC_DATA_TYPE(DATA_TYPE, M0) \
200 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
201 VSTORE(M0) \
202 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
203 })
204#elif M0 == 4 // M0 == 4
205#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
206 ({ \
207 VEC_DATA_TYPE(DATA_TYPE, M0) \
208 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
209 VSTORE(M0) \
210 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
211 })
212#elif M0 == 5 // M0 == 5
213#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
214 ({ \
215 VEC_DATA_TYPE(DATA_TYPE, 4) \
216 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
217 DATA_TYPE res1 = a4.s##i; \
218 VSTORE(4) \
219 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
220 *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
221 })
222#elif M0 == 6 // M0 == 6
223#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
224 ({ \
225 VEC_DATA_TYPE(DATA_TYPE, 4) \
226 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
227 VEC_DATA_TYPE(DATA_TYPE, 2) \
228 res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
229 VSTORE(4) \
230 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
231 VSTORE(2) \
232 (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
233 })
234#elif M0 == 7 // M0 == 7
235#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
236 ({ \
237 VEC_DATA_TYPE(DATA_TYPE, 4) \
238 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
239 VEC_DATA_TYPE(DATA_TYPE, 3) \
240 res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
241 VSTORE(4) \
242 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
243 VSTORE(3) \
244 (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
245 })
246#elif M0 == 8 // M0 == 8
247#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
248 ({ \
249 VEC_DATA_TYPE(DATA_TYPE, M0) \
250 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
251 VSTORE(M0) \
252 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
253 })
254#else // M0 not supported
255#error "M0 value not supported"
256#endif // N0 conditions
257
258/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
259 * the output matrix unrolling the values.
260 *
261 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
262 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
263 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
264 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
265 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
266 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
267 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
268 * @note Only the following values for M0, K0 and V0 are supported:
269 * M0: 2,3,4,5,6,7,8
270 * K0: 2,3,4,8,16
271 * V0: greater than 0
272 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
273 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
274 * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
275 * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
276 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
277 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
278 *
279 * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
280 * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
281 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
282 * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
283 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
284 * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
285 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
286 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
287 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
288 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
289 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
290 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
291 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
292 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
293 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
294 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
295 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
296 */
297__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
298 TENSOR3D_DECLARATION(dst)
299#if defined(REINTERPRET_INPUT_AS_3D)
300 ,
301 uint cross_plane_pad
302#endif // REINTERPRET_INPUT_AS_3D
303 )
304{
305 // Block size
306#define BLOCK_SIZE ((M0) * (K0))
307
308 // Output offset X
309#if defined(INTERLEAVE)
310#define OUTPUT_OFFSET_X (M0)
311#else // defined(INTERLEAVE)
312#define OUTPUT_OFFSET_X (BLOCK_SIZE)
313#endif // defined(INTERLEAVE)
314
315 // Output step X
316#if defined(INTERLEAVE)
317#define OUTPUT_STEP_X (M0) * (V0)
318#else // Do not interleave
319#define OUTPUT_STEP_X (M0)
320#endif // defined(INTERLEAVE)
321
322 // Compute source and destination addresses
323 uint x = get_global_id(0);
324 uint y = get_global_id(1);
325 uint z = get_global_id(2);
326
327 // ------------------ Compute input/output addresses ---------------------------
328
329 // Compute the input address
330 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
331
332 // Compute the output address
333 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
334 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
335
336 // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
337 REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
338
339#if defined(REINTERPRET_INPUT_AS_3D)
340 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
341 // multiply src_stride_z by DEPTH_GEMM3D
342
343 input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
344
345 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
346 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
347
348#else // defined(REINTERPRET_INPUT_AS_3D)
349
350 input_ptr += z * (uint)src_stride_z;
351
352#endif // defined(REINTERPRET_INPUT_AS_3D)
353
354 // Add offset for batched GEMM
355 output_ptr += z * (uint)dst_stride_z;
356
357 // ---------------------------Load input values --------------------------------
358 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
359
360 LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
361
362 // ---------------------------Transpose and store block -----------------------
363
364 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
365 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
366#if K0 > 2
367 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
368#endif // K0 > 2
369#if K0 > 3
370 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
371#endif // K0 > 3
372#if K0 > 4
373 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
374 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
375 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
376 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
377#endif // K0 > 4
378#if K0 > 8
379 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
380 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
381 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
382 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
383 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
384 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
385 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
386 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
387#endif // K0 > 8
388
389#undef BLOCK_SIZE
390#undef OUTPUT_OFFSET_X
391#undef OUTPUT_STEP_X
392}
393#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
394
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000395#if defined(RESHAPE_RHS_NT)
ramelg019cca5922021-11-11 10:05:00 +0000396/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
397 * the output matrix unrolling the values.
398 *
399 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
ramelg019cca5922021-11-11 10:05:00 +0000400 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
401 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
402 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
403 * @note Only the following values for K0, N0 and H0 are supported:
404 * N0: 2,3,4,8,16
405 * K0: 1,2,3,4,8,16
406 * H0: greater than 0
407 *
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000408 * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
409 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
410 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
411 * @param[in] src_w The size of the width dimension of the source tensor
412 * @param[in] src_h The size of the height dimension of the source tensor
413 * @param[in] src_n The size of the depth dimension of the source tensor
414 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
415 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
416 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
417 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
418 * @param[in] dst_w The size of the width dimension of the destination tensor
419 * @param[in] dst_h The size of the height dimension of the destination tensor
420 * @param[in] dst_n The size of the depth dimension of the destination tensor
421 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
422 * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0.
ramelg019cca5922021-11-11 10:05:00 +0000423 */
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000424__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
425 TENSOR3D_T(dst, BUFFER),
426 const int H0)
ramelg019cca5922021-11-11 10:05:00 +0000427{
428 // Block size
429#define BLOCK_SIZE ((K0) * (N0))
430
431 // Output offset X
432#if defined(INTERLEAVE)
433#define OUTPUT_OFFSET_X (N0)
434#else // defined(INTERLEAVE)
435#define OUTPUT_OFFSET_X (BLOCK_SIZE)
436#endif // defined(INTERLEAVE)
437
438 // Output step X
439#if defined(INTERLEAVE)
440#define OUTPUT_STEP_X (N0) * (H0)
441#else // Do not interleave
442#define OUTPUT_STEP_X (N0)
443#endif // defined(INTERLEAVE)
444
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000445 const int x = GET_SPATIAL_IDX(0, 1, 0);
446 const int y = GET_SPATIAL_IDX(1, 1, 0);
447 const int z = GET_SPATIAL_IDX(2, 1, 0);
ramelg019cca5922021-11-11 10:05:00 +0000448
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000449 const int xi = x * N0;
450 const int yi = y * K0;
ramelg019cca5922021-11-11 10:05:00 +0000451
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000452 const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
453 const int yo = (x / H0);
ramelg019cca5922021-11-11 10:05:00 +0000454
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000455 src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
456 dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
ramelg019cca5922021-11-11 10:05:00 +0000457
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000458 TILE(DATA_TYPE, K0, N0, in);
ramelg019cca5922021-11-11 10:05:00 +0000459
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000460 // Initialize the tile to zero
461 for(int i = 0; i < K0; ++i)
462 {
463 in[i].v = 0;
464 }
ramelg019cca5922021-11-11 10:05:00 +0000465
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000466 // Load input tile
467 for(int i = 0; i < K0; ++i)
ramelg019cca5922021-11-11 10:05:00 +0000468 {
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000469 if(yi + i < src_h)
470 {
471 in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
472 }
ramelg019cca5922021-11-11 10:05:00 +0000473 }
ramelg019cca5922021-11-11 10:05:00 +0000474
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000475 TILE(uint, K0, 1, dst_indirect_y);
476 for(int i = 0; i < K0; ++i)
477 {
478 dst_indirect_y[i].v = i;
479 }
480
481 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
ramelg019cca5922021-11-11 10:05:00 +0000482
483#undef BLOCK_SIZE
484#undef OUTPUT_OFFSET_X
485#undef OUTPUT_STEP_X
486}
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000487#endif // defined(RESHAPE_RHS_NT)
ramelg019cca5922021-11-11 10:05:00 +0000488
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000489#if defined(RESHAPE_RHS_T)
ramelg019cca5922021-11-11 10:05:00 +0000490/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
491 * the output matrix unrolling the values.
492 *
493 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
ramelg019cca5922021-11-11 10:05:00 +0000494 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
495 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
496 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
497 * @note The option -DTRANSPOSE must passed at compile time.
498 * @note Only the following values for K0, N0 and H0 are supported:
499 * N0: 2,3,4,8,16
500 * K0: 2,3,4,8,16
501 * H0: greater than 0
502 *
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000503 * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
504 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
505 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
506 * @param[in] src_w The size of the width dimension of the source tensor
507 * @param[in] src_h The size of the height dimension of the source tensor
508 * @param[in] src_n The size of the depth dimension of the source tensor
509 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
510 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
511 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
512 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
513 * @param[in] dst_w The size of the width dimension of the destination tensor
514 * @param[in] dst_h The size of the height dimension of the destination tensor
515 * @param[in] dst_n The size of the depth dimension of the destination tensor
516 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
517 * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0.
ramelg019cca5922021-11-11 10:05:00 +0000518 */
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000519__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
520 TENSOR3D_T(dst, BUFFER),
521 const int H0)
ramelg019cca5922021-11-11 10:05:00 +0000522{
523 // Block size
524#define BLOCK_SIZE ((K0) * (N0))
525
526 // Output offset X
527#if defined(INTERLEAVE)
528#define OUTPUT_OFFSET_X (K0)
529#else // defined(INTERLEAVE)
530#define OUTPUT_OFFSET_X (BLOCK_SIZE)
531#endif // defined(INTERLEAVE)
532
533 // Output step X
534#if defined(INTERLEAVE)
535#define OUTPUT_STEP_X (K0) * (H0)
536#else // Do not interleave
537#define OUTPUT_STEP_X (K0)
538#endif // defined(INTERLEAVE)
539
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000540 const int x = GET_SPATIAL_IDX(0, 1, 0);
541 const int y = GET_SPATIAL_IDX(1, 1, 0);
542 const int z = GET_SPATIAL_IDX(2, 1, 0);
ramelg019cca5922021-11-11 10:05:00 +0000543
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000544 const int xi = x * N0;
545 const int yi = y * K0;
ramelg019cca5922021-11-11 10:05:00 +0000546
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000547 const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
548 const int yo = (x / H0);
ramelg019cca5922021-11-11 10:05:00 +0000549
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000550 src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
551 dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
ramelg019cca5922021-11-11 10:05:00 +0000552
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000553 TILE(DATA_TYPE, K0, N0, in);
554 TILE(DATA_TYPE, N0, K0, in_tr);
ramelg019cca5922021-11-11 10:05:00 +0000555
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000556 // Initialize the tile to zero
557 for(int i = 0; i < K0; ++i)
ramelg019cca5922021-11-11 10:05:00 +0000558 {
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000559 in[i].v = 0;
ramelg019cca5922021-11-11 10:05:00 +0000560 }
ramelg019cca5922021-11-11 10:05:00 +0000561
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000562 // Load input tile
563 for(int i = 0; i < K0; ++i)
564 {
565 if(yi + i < src_h)
566 {
567 in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
568 }
569 }
ramelg019cca5922021-11-11 10:05:00 +0000570
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000571 // Transpose input tile
572 for(int k0 = 0; k0 < K0; ++k0)
573 {
574 for(int n0 = 0; n0 < N0; ++n0)
575 {
576 in_tr[n0].s[k0] = in[k0].s[n0];
577 }
578 }
ramelg019cca5922021-11-11 10:05:00 +0000579
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000580 TILE(uint, N0, 1, dst_indirect_y);
581 for(int i = 0; i < N0; ++i)
582 {
583 dst_indirect_y[i].v = i;
584 }
ramelg019cca5922021-11-11 10:05:00 +0000585
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000586 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
ramelg019cca5922021-11-11 10:05:00 +0000587
588#undef BLOCK_SIZE
589#undef OUTPUT_OFFSET_X
590#undef OUTPUT_STEP_X
591}
Gian Marco Iodice4fb56702021-11-10 11:18:50 +0000592
593#endif // defined(RESHAPE_RHS_T)