blob: adb3a1c25dc7c2ad4e9dd482572d681272237eb7 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Usama Arif0681e3b2019-04-25 14:28:07 +010024#include "gemm_helpers.h"
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +000025#include "repeat.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
Gian Marco Iodiceb87b95e2019-01-21 17:14:31 +000027#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
// Per-lane X offsets (0 .. K0-1) for every supported vector width. INC(K0) selects the
// vector matching the compile-time K0 so each lane can be compared against SRC_WIDTH.
#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
// Two-level expansion: INC() lets its argument be macro-expanded (e.g. K0 -> 4) before
// CONCAT_INC() pastes it onto the INC prefix.
#define CONCAT_INC(k0) INC##k0
#define INC(k0) CONCAT_INC(k0)

/** Zero the lanes of a K0-wide row vector that fall outside the source width.
 *
 * Lane i of @p a is kept when (x * K0 + i) < SRC_WIDTH and replaced with 0 otherwise.
 * When SRC_WIDTH is a multiple of K0 no lane can be out of bounds and the macro expands to an empty statement.
 *
 * @param[in]      x Index of the K0-wide block along the X dimension
 * @param[in, out] a Row vector (VEC_DATA_TYPE(DATA_TYPE, K0)) to mask in place
 */
#if(SRC_WIDTH % K0)
#define BOUNDARY_CONDITION_X(x, a)                                                                                                                              \
    ({                                                                                                                                                          \
        (a) = select(0, (a), CONVERT((((x) * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0)));      \
    })
#else // (SRC_WIDTH % K0)
#define BOUNDARY_CONDITION_X(x, a) \
    ({})
#endif // (SRC_WIDTH % K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000045
/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note Only the following values for M0, K0 and V0 are supported:
 *                                      M0: 2,3,4,5,6,7,8
 *                                      K0: 2,3,4,8,16
 *                                      V0: greater than 0
 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 *
 * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 */
__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                         ,
                                         uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                        )
{
    // Block size (number of elements in one M0xK0 block)
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: distance (in elements) between the starts of two consecutive blocks on the same output row
    // OUTPUT_STEP_X:   distance (in elements) between two consecutive rows of the same block in the output
    // Both replacement lists are fully parenthesized so they can be used safely inside larger expressions.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#define OUTPUT_STEP_X ((K0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Compute source and destination addresses
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: block (x, y) starts at column x * K0 and row y * M0
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 vertically consecutive blocks share one output row
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % (uint)V0) *
                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D

    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    // Load values from the LHS matrix
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // Zero the lanes that lie beyond SRC_WIDTH (no-op when SRC_WIDTH is a multiple of K0)
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Store output values ------------------------------
    // No Z offset is applied on the output side: zout0..zoutF are all 0
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000175
/** Gather column @p i of the loaded M0xK0 block (rows a0 .. a(M0-1)) into a single M0-element column and store it.
 *
 * The column is written at byte address output_ptr + i * output_step_x * sizeof(DATA_TYPE).
 * For M0 = 5, 6, 7 the column is split into a 4-element store followed by a store of the remaining 1, 2 or 3 elements.
 *
 * @param[in] output_ptr    Byte pointer to the start of the destination block
 * @param[in] output_step_x Distance, in elements, between two consecutive transposed columns in the output
 * @param[in] i             Column index as a single hexadecimal digit (0-9, A-F); it is pasted both as
 *                          the vector component selector (.s<i>) and as the literal 0x<i>
 */
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                              \
    ({                                                                                                        \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                          \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i);                                               \
        VSTORE(M0)                                                                                            \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));         \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                              \
    ({                                                                                                        \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                          \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i);                                      \
        VSTORE(M0)                                                                                            \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));         \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                              \
    ({                                                                                                        \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                          \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                             \
        VSTORE(M0)                                                                                            \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));         \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                              \
    ({                                                                                                        \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                           \
        res0           = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                   \
        DATA_TYPE res1 = a4.s##i;                                                                             \
        VSTORE(4)                                                                                             \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));        \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1;     \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                              \
    ({                                                                                                        \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                           \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                             \
        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                           \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i);                                               \
        VSTORE(4)                                                                                             \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));        \
        VSTORE(2)                                                                                             \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4);    \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                              \
    ({                                                                                                        \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                           \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                             \
        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                           \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i);                                      \
        VSTORE(4)                                                                                             \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));        \
        VSTORE(3)                                                                                             \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4);    \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                                      \
    ({                                                                                                                \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                                  \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0)                                                                                                    \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));                 \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions
245
/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note Only the following values for M0, K0 and V0 are supported:
 *                                      M0: 2,3,4,5,6,7,8
 *                                      K0: 2,3,4,8,16
 *                                      V0: greater than 0
 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 *
 * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 */
__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                        ,
                                        uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                       )
{
    // Block size (number of elements in one M0xK0 block)
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: distance (in elements) between the starts of two consecutive blocks on the same output row
    // OUTPUT_STEP_X:   distance (in elements) between two consecutive transposed columns of the same block in the output
    // Both replacement lists are fully parenthesized so they can be used safely inside larger expressions.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (M0)
#define OUTPUT_STEP_X ((M0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (M0)
#endif // defined(INTERLEAVE)

    // Compute source and destination addresses
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: block (x, y) starts at column x * K0 and row y * M0
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 vertically consecutive blocks share one output row
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % (uint)V0) *
                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D

    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Load values from the LHS matrix
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // Zero the lanes that lie beyond SRC_WIDTH (no-op when SRC_WIDTH is a multiple of K0)
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Transpose and store block -----------------------

    // One store per source column: K0 columns in total, each holding M0 values.
    // The column index is a hexadecimal digit because it is pasted into .s<i> and 0x<i>.
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
#if K0 > 2
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
#endif // K0 > 2
#if K0 > 3
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
#endif // K0 > 3
#if K0 > 4
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
#endif // K0 > 4
#if K0 > 8
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
#endif // K0 > 8

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
Gian Marco Iodiceb87b95e2019-01-21 17:14:31 +0000399#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000400
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000401#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 * @note Only the following values for K0, N0 and H0 are supported:
 *                                      N0: 2,3,4,8,16
 *                                      K0: 1,2,3,4,8,16
 *                                      H0: greater than 0
 *
 * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst))
{
    // Block size (number of elements in one K0xN0 block)
#define BLOCK_SIZE ((K0) * (N0))

    // OUTPUT_OFFSET_X: distance (in elements) between the starts of two consecutive blocks on the same output row
    // OUTPUT_STEP_X:   distance (in elements) between two consecutive rows of the same block in the output
    // Both replacement lists are fully parenthesized so they can be used safely inside larger expressions.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (N0)
#define OUTPUT_STEP_X ((N0) * (H0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (N0)
#endif // defined(INTERLEAVE)

    // Compute source and destination addresses
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: block (x, y) starts at column x * N0 and row y * K0
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address: H0 horizontally consecutive blocks (x direction) share one output row
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
                                     x / (uint)H0)
                                 * (uint)dst_stride_y)
                                 + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Create variables: VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, a2=0...a(K0-1)=0;
    // Rows that are not loaded below (row index >= SRC_HEIGHT) keep this zero value when the block is stored.
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0);

    // Load values from the RHS matrix. Row 0 of the block is loaded unconditionally;
    // every following row is loaded only if it lies inside the source height.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
#if K0 > 1
    if(y * (uint)K0 + 1 < SRC_HEIGHT)
    {
        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    }
#endif // K0 > 1
#if K0 > 2
    if(y * (uint)K0 + 2 < SRC_HEIGHT)
    {
        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
    }
#endif // K0 > 2
#if K0 > 3
    if(y * (uint)K0 + 3 < SRC_HEIGHT)
    {
        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
    }
#endif // K0 > 3
#if K0 > 4
    if(y * (uint)K0 + 4 < SRC_HEIGHT)
    {
        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
    }
    if(y * (uint)K0 + 5 < SRC_HEIGHT)
    {
        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
    }
    if(y * (uint)K0 + 6 < SRC_HEIGHT)
    {
        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
    }
    if(y * (uint)K0 + 7 < SRC_HEIGHT)
    {
        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
    }
#endif // K0 > 4
#if K0 > 8
    if(y * (uint)K0 + 8 < SRC_HEIGHT)
    {
        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
    }
    if(y * (uint)K0 + 9 < SRC_HEIGHT)
    {
        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
    }
    if(y * (uint)K0 + 10 < SRC_HEIGHT)
    {
        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
    }
    if(y * (uint)K0 + 11 < SRC_HEIGHT)
    {
        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
    }
    if(y * (uint)K0 + 12 < SRC_HEIGHT)
    {
        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
    }
    if(y * (uint)K0 + 13 < SRC_HEIGHT)
    {
        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
    }
    if(y * (uint)K0 + 14 < SRC_HEIGHT)
    {
        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
    }
    if(y * (uint)K0 + 15 < SRC_HEIGHT)
    {
        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
    }
#endif // K0 > 8

    // ---------------------------Store output values ------------------------------
    // No Z offset is applied on the output side: zout0..zoutF are all 0
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
553
554#if defined(TRANSPOSE)
/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
 *  the output matrix unrolling the values.
 *
 * Each work-item reads up to K0 rows of N0 elements, transposes them into N0 vectors of K0 elements and writes those N0 vectors to the output.
 * The first row of a block is always loaded; every following row is loaded only if it lies inside the input (row index < SRC_HEIGHT),
 * otherwise it keeps its zero initialisation, so the tail of the last block along Y is zero-filled.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 * @note The option -DTRANSPOSE must be passed at compile time.
 * @note Only the following values for K0, N0 and H0 are supported:
 *                                      N0: 2,3,4,8,16
 *                                      K0: 2,3,4,8,16
 *                                      H0: greater than 0
 *
 * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst))
{
    // Block size (number of elements in one K0xN0 block)
#define BLOCK_SIZE ((K0) * (N0))

    // Output offset X: distance (in elements) between the start of two consecutive blocks on the same output row
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif // defined(INTERLEAVE)

    // Output step X: distance (in elements) between two consecutive transposed rows of the same block.
    // The expansion is fully parenthesised so that it can be used safely inside larger expressions.
#if defined(INTERLEAVE)
#define OUTPUT_STEP_X ((K0) * (H0))
#else // Do not interleave
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates: x selects the block along the input columns, y the block along the input rows, z the plane
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address (x advances by N0 elements, y by K0 rows)
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address (H0 blocks share one output row; the remaining part of x selects the output row)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
                                 (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

    // Load values from the RHS matrix.
    // Row 0 is loaded unconditionally; each further row is loaded only when it is inside the input height,
    // otherwise the corresponding vector keeps the zero it was initialised with.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
    if(y * (uint)K0 + 1 < SRC_HEIGHT)
    {
        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    }
#if K0 > 2
    if(y * (uint)K0 + 2 < SRC_HEIGHT)
    {
        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
    }
#endif // K0 > 2
#if K0 > 3
    if(y * (uint)K0 + 3 < SRC_HEIGHT)
    {
        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
    }
#endif // K0 > 3
#if K0 > 4
    if(y * (uint)K0 + 4 < SRC_HEIGHT)
    {
        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
    }
    if(y * (uint)K0 + 5 < SRC_HEIGHT)
    {
        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
    }
    if(y * (uint)K0 + 6 < SRC_HEIGHT)
    {
        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
    }
    if(y * (uint)K0 + 7 < SRC_HEIGHT)
    {
        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
    }
#endif // K0 > 4
#if K0 > 8
    if(y * (uint)K0 + 8 < SRC_HEIGHT)
    {
        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
    }
    if(y * (uint)K0 + 9 < SRC_HEIGHT)
    {
        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
    }
    if(y * (uint)K0 + 10 < SRC_HEIGHT)
    {
        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
    }
    if(y * (uint)K0 + 11 < SRC_HEIGHT)
    {
        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
    }
    if(y * (uint)K0 + 12 < SRC_HEIGHT)
    {
        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
    }
    if(y * (uint)K0 + 13 < SRC_HEIGHT)
    {
        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
    }
    if(y * (uint)K0 + 14 < SRC_HEIGHT)
    {
        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
    }
    if(y * (uint)K0 + 15 < SRC_HEIGHT)
    {
        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
    }
#endif // K0 > 8

    // ---------------------------Transpose the block ------------------------------
    // resJ gathers component J of every loaded row: resJ = (a0.sJ, a1.sJ, ..., a(K0-1).sJ)
    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

#if K0 == 2
    // This part computes the following transpositions:
    // 2x2 -> 2x2
    // 2x4 -> 4x2
    // 2x8 -> 8x2
    // 2x16 -> 16x2
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
#endif // N0 > 8

#elif K0 == 3 // K0 == 3
    // This part computes the following transpositions:
    // 3x2 -> 2x3
    // 3x4 -> 4x3
    // 3x8 -> 8x3
    // 3x16 -> 16x3
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
#endif // N0 > 8

#elif K0 == 4 // K0 == 4
    // This part computes the following transpositions:
    // 4x2 -> 2x4
    // 4x4 -> 4x4
    // 4x8 -> 8x4
    // 4x16 -> 16x4
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
#endif // N0 > 8

#elif K0 == 8 // K0 == 8
    // This part computes the following transpositions:
    // 8x2 -> 2x8
    // 8x4 -> 4x8
    // 8x8 -> 8x8
    // 8x16 -> 16x8
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
#endif // N0 > 8

#elif K0 == 16 // K0 == 16

    // This part computes the following transpositions:
    // 16x2 -> 2x16
    // 16x4 -> 4x16
    // 16x8 -> 8x16
    // 16x16 -> 16x16
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
                                          a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
                                          a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
                                          a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
                                          a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
                                          a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
                                          a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
                                          a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
                                          a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
                                          a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
                                          a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
                                          a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
                                          a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
                                          a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
                                          a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
                                          a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
                                          a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
#endif // N0 > 8

#else // K0 not supported
    // The chain above dispatches on K0, so the diagnostic must name K0 (not N0)
#error "Not supported K0 value"
#endif // K0 conditions

    // ---------------------------Store the output values ------------------------------
    // No per-row Z offset is needed for the reshaped output: all zout values are 0
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
881#endif // defined(TRANSPOSE)
882#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
883
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000884#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +0000885
/** Paste two tokens together.
 *
 * The arguments are pasted as written (operands of ## are not macro-expanded), so callers that need
 * an expanded value (e.g. K0 -> 4) must pass it through another macro parameter first, as
 * ARM_DOT_K0XN0 does with its k0 parameter.
 */
#define CONCAT(a, b) a##b

/** ARM_DOTn(a, b, c): accumulate into @p c the dot product of the first n components of @p a and @p b,
 *  one fma() per component (c = fma(a.si, b.si, c)).
 *
 * ARM_DOT1 works on scalars; ARM_DOT2/3/4 on the .s0-.s3 components; ARM_DOT8 and ARM_DOT16 split the
 * operands into their .lo/.hi halves and reuse the smaller variants.
 * The vector arguments are parenthesised before component access so that any expression can be passed.
 * @p c must be an lvalue. The bodies are written as ({ ... }) compound-statement expressions.
 */
#define ARM_DOT1(a, b, c)         \
    ({                            \
        c = fma((a), (b), c);     \
    })
#define ARM_DOT2(a, b, c)               \
    ({                                  \
        c = fma((a).s0, (b).s0, c);     \
        c = fma((a).s1, (b).s1, c);     \
    })
#define ARM_DOT3(a, b, c)               \
    ({                                  \
        ARM_DOT2(a, b, c);              \
        c = fma((a).s2, (b).s2, c);     \
    })
#define ARM_DOT4(a, b, c)               \
    ({                                  \
        ARM_DOT3(a, b, c);              \
        c = fma((a).s3, (b).s3, c);     \
    })
#define ARM_DOT8(a, b, c)                   \
    ({                                      \
        ARM_DOT4(((a).lo), ((b).lo), c);    \
        ARM_DOT4(((a).hi), ((b).hi), c);    \
    })
#define ARM_DOT16(a, b, c)                  \
    ({                                      \
        ARM_DOT8(((a).lo), ((b).lo), c);    \
        ARM_DOT8(((a).hi), ((b).hi), c);    \
    })

/** ARM_DOT_K0XN0(k0, a, b, c): for every j in [0, N0) accumulate dot_k0(a, b<j>) into component j of @p c.
 *
 * @p k0 selects the ARM_DOT<k0> variant, @p a is one vector (or scalar when k0 is 1), @p b is the
 * BASENAME of N0 variables (b##0 ... b##(N0-1), hexadecimal suffixes for 10-15) and @p c is the
 * accumulator vector with N0 components.
 * One definition is provided for each supported N0 (2, 3, 4, 8, 16); any other value is a compile error.
 */
#if N0 == 2
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2));       \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2));       \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3));       \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2));       \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3));       \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4));       \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5));       \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6));       \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7));       \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2));       \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3));       \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4));       \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5));       \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6));       \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7));       \
        CONCAT(ARM_DOT, k0)((a), (b##8), (c.s8));       \
        CONCAT(ARM_DOT, k0)((a), (b##9), (c.s9));       \
        CONCAT(ARM_DOT, k0)((a), (b##A), (c.sA));       \
        CONCAT(ARM_DOT, k0)((a), (b##B), (c.sB));       \
        CONCAT(ARM_DOT, k0)((a), (b##C), (c.sC));       \
        CONCAT(ARM_DOT, k0)((a), (b##D), (c.sD));       \
        CONCAT(ARM_DOT, k0)((a), (b##E), (c.sE));       \
        CONCAT(ARM_DOT, k0)((a), (b##F), (c.sF));       \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions
1007
1008/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1009 * The LHS matrix is NOT reshaped
1010 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
1011 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001012 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
1014 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
1015 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
1016 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
1017 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01001019 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1020 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001021 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1022 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1023 * - N0 = 2, 3, 4, 8, 16
1024 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001025 * - H0 >= 1
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001026 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001028 * The activation function is performed after the bias addition
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001029 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1030 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1031 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1032 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1033 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1034 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1035 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001048 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
1049 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
1050 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
1051 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
1052 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
1053 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001054 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1055 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1056 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1057 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1058 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1059 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Sheri Zhang1a378102020-04-30 12:59:39 +01001060 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001061 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001062 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001063 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1064 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1065 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001066 */
1067__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
1068 IMAGE_DECLARATION(rhs),
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001069#if defined(BETA)
1070 IMAGE_DECLARATION(bias),
1071#endif // defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001072 IMAGE_DECLARATION(dst),
1073 uint lhs_stride_z,
1074 uint rhs_stride_z,
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001075#if defined(BETA)
1076 uint bias_stride_z,
1077#endif //defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001078 uint dst_stride_z
1079#if defined(REINTERPRET_INPUT_AS_3D)
1080 ,
1081 uint lhs_cross_plane_pad
1082#endif // REINTERPRET_INPUT_AS_3D
1083#if defined(REINTERPRET_OUTPUT_AS_3D)
1084 ,
1085 uint dst_cross_plane_pad
1086#endif // REINTERPRET_OUTPUT_AS_3D
1087 )
1088{
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001089 // Block size
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001090#define RHS_BLOCK_SIZE ((K0) * (N0))
1091
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001092 // RHS offset and step X
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001093#if defined(RHS_INTERLEAVE)
1094#define RHS_OFFSET_X (K0)
1095#define RHS_STEP_X ((K0) * (H0))
1096#define RHS_STEP_LOOP (1)
1097#else // defined(RHS_INTERLEAVE)
1098#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1099#define RHS_STEP_X (K0)
1100#define RHS_STEP_LOOP (H0)
1101#endif // defined(RHS_INTERLEAVE)
1102
1103 uint x = get_global_id(0);
1104 uint y = get_global_id(1);
1105 uint z = get_global_id(2);
1106
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001107#if defined(DUMMY_WORK_ITEMS)
1108 if((x * N0 >= N) || (y * M0 >= M))
1109 {
1110 return;
1111 }
1112#endif // defined(DUMMY_WORK_ITEMS)
1113
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001114 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01001115 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001116
Sheri Zhang1a378102020-04-30 12:59:39 +01001117 // Compute RHS reshaped matrix address
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001118 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
1119
1120#if defined(MATRIX_B_DEPTH)
1121 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1122 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1123#else // defined(MATRIX_B_DEPTH)
1124 rhs_offset += z * rhs_stride_z;
1125#endif // defined(MATRIX_B_DEPTH)
1126
Usama Arif0681e3b2019-04-25 14:28:07 +01001127 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001128 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001129
1130#if defined(REINTERPRET_INPUT_AS_3D)
Usama Arif0681e3b2019-04-25 14:28:07 +01001131 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
1132 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001133
1134 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1135 // multiply lhs_stride_z by DEPTH_GEMM3D
1136 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1137
1138#else // defined(REINTERPRET_INPUT_AS_3D)
1139
1140 // Add offset for batched GEMM
1141 lhs_offset += z * lhs_stride_z;
1142
1143#endif // defined(REINTERPRET_INPUT_AS_3D)
1144
1145 // Initialize the accumulators
1146 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
1147
1148 int i = 0;
1149 for(; i <= (K - K0); i += K0)
1150 {
1151 // Supported cases (M0, K0):
1152 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
1153 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
1154 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
1155 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
1156 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
1157 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
1158 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
1159 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
1160 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001161 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001162
Sheri Zhang1a378102020-04-30 12:59:39 +01001163 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001164 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001165
1166 // Accumulate
1167 ARM_DOT_K0XN0(K0, a0, b, c0);
1168#if M0 > 1
1169 ARM_DOT_K0XN0(K0, a1, b, c1);
1170#endif // M0 > 1
1171#if M0 > 2
1172 ARM_DOT_K0XN0(K0, a2, b, c2);
1173#endif // M0 > 2
1174#if M0 > 3
1175 ARM_DOT_K0XN0(K0, a3, b, c3);
1176#endif // M0 > 3
1177#if M0 > 4
1178 ARM_DOT_K0XN0(K0, a4, b, c4);
1179#endif // M0 > 4
1180#if M0 > 5
1181 ARM_DOT_K0XN0(K0, a5, b, c5);
1182#endif // M0 > 5
1183#if M0 > 6
1184 ARM_DOT_K0XN0(K0, a6, b, c6);
1185#endif // M0 > 6
1186#if M0 > 7
1187 ARM_DOT_K0XN0(K0, a7, b, c7);
1188#endif // M0 > 7
1189
1190 lhs_offset += K0 * sizeof(DATA_TYPE);
1191 rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
1192 }
1193
1194 // Left-over accumulations
1195 for(; i < K; ++i)
1196 {
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001197 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001198 LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001199
Sheri Zhang1a378102020-04-30 12:59:39 +01001200 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001201 LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001202
1203 // Accumulate
1204 ARM_DOT_K0XN0(1, a0, b, c0);
1205#if M0 > 1
1206 ARM_DOT_K0XN0(1, a1, b, c1);
1207#endif // M0 > 1
1208#if M0 > 2
1209 ARM_DOT_K0XN0(1, a2, b, c2);
1210#endif // M0 > 2
1211#if M0 > 3
1212 ARM_DOT_K0XN0(1, a3, b, c3);
1213#endif // M0 > 3
1214#if M0 > 4
1215 ARM_DOT_K0XN0(1, a4, b, c4);
1216#endif // M0 > 4
1217#if M0 > 5
1218 ARM_DOT_K0XN0(1, a5, b, c5);
1219#endif // M0 > 5
1220#if M0 > 6
1221 ARM_DOT_K0XN0(1, a6, b, c6);
1222#endif // M0 > 6
1223#if M0 > 7
1224 ARM_DOT_K0XN0(1, a7, b, c7);
1225#endif // M0 > 7
1226
1227 lhs_offset += sizeof(DATA_TYPE);
1228 rhs_offset += sizeof(DATA_TYPE);
1229 }
1230
SiCong Li406a13f2020-07-15 12:09:58 +01001231 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001232
1233 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1234
1235#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001236
1237 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +01001238 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001239
1240 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1241 // multiply dst_stride_z by DEPTH_GEMM3D
1242 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1243
1244#else // defined(REINTERPRET_OUTPUT_AS_3D)
1245
1246 // Add offset for batched GEMM
1247 dst_addr += z * dst_stride_z;
1248
1249#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1250
1251 // Multiply by the weight of matrix-matrix product and store the result
1252#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01001253 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001254#endif // defined(ALPHA)
1255
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001256 // Add beta*bias
1257#if defined(BETA)
1258#if defined(BROADCAST_BIAS)
1259 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
1260
1261 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1262
1263#ifndef UNIT_BETA
1264 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
1265#endif // UNIT_BIAS
1266
1267 // c = c + bias[broadcasted]
1268 ADD_BLOCK_BROADCAST(M0, c, bias0);
1269
1270#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01001271 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001272
1273 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1274
1275#ifndef UNIT_BETA
1276 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
1277#endif // UNIT_BIAS
1278
1279 // c = c + bias
1280 ADD_BLOCK(M0, c, bias);
1281
1282#endif // defined(BROADCAST_BIAS)
1283#endif // defined(BETA)
1284
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001285#if defined(ACTIVATION_TYPE)
1286 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
1287#endif // defined(ACTIVATION_TYPE)
1288
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001289 // Store output block
SiCong Li406a13f2020-07-15 12:09:58 +01001290 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001291
1292#undef RHS_BLOCK_SIZE
1293#undef RHS_OFFSET_X
1294#undef RHS_STEP_X
1295}
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001296
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t rhs_img,
#if defined(BETA)
                                                  IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                  IMAGE_DECLARATION(dst),
                                                  uint lhs_stride_z,
                                                  uint rhs_stride_z,
#if defined(BETA)
                                                  uint bias_stride_z,
#endif //defined(BETA)
                                                  uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                  ,
                                                  uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
                                                  uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Number of K elements that do not fill a whole K0 block
#define LEFTOVER_K (K % K0)

    // Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts at or beyond N columns / M rows have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

#if LEFTOVER_K != 0
    // Note: We cannot read out-of-bound elements from the RHS matrix because
    // the RHS width is always multiple of K0. This is not true for the LHS matrix

    // Gives element-wise access (s[]) to a K0-wide vector (v), so that the LHS row can be
    // filled one scalar at a time
    union UNION_VEC_TYPE
    {
        DATA_TYPE s[K0];
        VEC_DATA_TYPE(DATA_TYPE, K0) v;
    };

    // Zero-initialised: lanes at index >= LEFTOVER_K are never written below and keep the value 0
    union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
    union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
    union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
    union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
    union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
    union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
    union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
    union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7

    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

    // Load from RHS matrix
    LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

    // Load from LHS matrix: one scalar per row and per iteration, only for the LEFTOVER_K valid columns
    for(int k = 0; k < LEFTOVER_K; ++k)
    {
        a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
        a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
        a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
        a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
        a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
        a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
        a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
        a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7

        lhs_offset += sizeof(DATA_TYPE);
    }

    // Accumulate
    ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
    ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
    ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
    ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
    ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
    ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
    ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
    ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7

#endif // LEFTOVER_K != 0

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout(M0-1)=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

    // Undefine every helper macro introduced by this kernel so that none of them leaks past it
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef LEFTOVER_K
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
1642
/** In-place fused multiply-add: stores fma(factor0, factor1, acc) back into @p acc.
 *
 * @param[in]      factor0 First operand forwarded to fma()
 * @param[in]      factor1 Second operand forwarded to fma()
 * @param[in, out] acc     Third operand forwarded to fma(); it must be assignable because the result is written back to it
 */
#define VFMA(factor0, factor1, acc)         \
    ({                                      \
        acc = fma(factor0, factor1, acc);   \
    })
1647
1648#if M0 == 1
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001649#define VFMA_M0xN0(i, a, b, c) \
1650 ({ \
1651 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001652 })
1653#elif M0 == 2 // M0 == 2
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001654#define VFMA_M0xN0(i, a, b, c) \
1655 ({ \
1656 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1657 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001658 })
1659#elif M0 == 3 // M0 == 3
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001660#define VFMA_M0xN0(i, a, b, c) \
1661 ({ \
1662 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1663 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1664 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001665 })
1666#elif M0 == 4 // M0 == 4
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001667#define VFMA_M0xN0(i, a, b, c) \
1668 ({ \
1669 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1670 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1671 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1672 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001673 })
1674#elif M0 == 5 // M0 == 5
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001675#define VFMA_M0xN0(i, a, b, c) \
1676 ({ \
1677 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1678 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1679 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1680 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1681 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001682 })
1683#elif M0 == 6 // M0 == 6
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001684#define VFMA_M0xN0(i, a, b, c) \
1685 ({ \
1686 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1687 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1688 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1689 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1690 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1691 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001692 })
1693#elif M0 == 7 // M0 == 7
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001694#define VFMA_M0xN0(i, a, b, c) \
1695 ({ \
1696 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1697 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1698 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1699 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1700 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1701 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
1702 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001703 })
1704#elif M0 == 8 // M0 == 8
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001705#define VFMA_M0xN0(i, a, b, c) \
1706 ({ \
1707 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1708 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1709 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1710 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1711 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1712 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
1713 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
1714 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001715 })
1716#else // M0 not supported
1717#error "M0 not supported"
1718#endif // M0 not supported
1719
1720/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1721 * The LHS matrix is NOT reshaped
1722 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
1723 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001724 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01001725 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
1726 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
1727 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
1728 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001729 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01001730 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1731 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001732 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1733 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1734 * - N0 = 2, 3, 4, 8, 16
1735 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001736 * - H0 >= 1
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001737 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01001738 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001739 * The activation function is performed after the bias addition
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001740 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1741 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1742 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1743 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1744 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1745 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1746 *
Sheri Zhang1a378102020-04-30 12:59:39 +01001747 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
1748 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001749 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001750 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001751 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001752 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001753 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1754 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1755 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1756 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1757 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1758 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001759 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
1760 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001761 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001762 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001763 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
1764 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
1765 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1766 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1767 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1768 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1769 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1770 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Sheri Zhang1a378102020-04-30 12:59:39 +01001771 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001772 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001773 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001774 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1775 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1776 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001777 */
__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
                                           IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                           IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                           IMAGE_DECLARATION(dst),
                                           uint lhs_stride_z,
                                           uint rhs_stride_z,
#if defined(BETA)
                                           uint bias_stride_z,
#endif // defined(BETA)
                                           uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                           ,
                                           uint lhs_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                           ,
                                           uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                          )
{
    // Number of elements in one K0xN0 block of the reshaped RHS matrix
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // Addressing constants for the reshaped RHS matrix (all expressed in elements):
    //  RHS_OFFSET_X  - scaled by (x % H0) to find where this work-item starts reading inside its row
    //  RHS_STEP_X    - distance between the N0-wide vectors read for two consecutive values of k
    //  RHS_STEP_LOOP - extra factor applied when rhs_offset advances after K0 accumulations
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    // Address of the k-th N0-wide RHS vector relative to the current rhs_offset
#define RHS_ROW_PTR(k) ((__global DATA_TYPE *)(rhs_ptr + rhs_offset + (k) * RHS_STEP_X * sizeof(DATA_TYPE)))

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items that fall entirely outside the M x N output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Byte offset of the first LHS row handled by this work-item
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Byte offset of the first reshaped RHS value handled by this work-item
    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Matrix B has fewer batches than matrix A: wrap the batch index instead of sliding past the end
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   // uint zin0=0, zin1=0, ... zin7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // uint zero0=0, zero1=0, ... zero15=0;

#if defined(REINTERPRET_INPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Batched GEMM: the batches live in the fourth dimension, hence lhs_stride_z is scaled by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Batched GEMM: move to the LHS batch selected by z
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Accumulators: VEC_DATA_TYPE(DATA_TYPE, N0) c0=0, c1=0, ... c(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Supported (M0, K0) combinations: M0 in 1..8 with K0 in {2, 3, 4, 8, 16}

        // Load an M0 x K0 block from the LHS matrix into a0 .. a(M0-1)
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        // One RHS vector is loaded and accumulated per k; the K0 steps are unrolled by the preprocessor
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(0));
        VFMA_M0xN0(0, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(1));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(2));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(3));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(4));
        VFMA_M0xN0(4, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(5));
        VFMA_M0xN0(5, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(6));
        VFMA_M0xN0(6, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(7));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        // Steps 10..15 use the hexadecimal suffixes A..F expected by VFMA_M0xN0
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(8));
        VFMA_M0xN0(8, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(9));
        VFMA_M0xN0(9, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(10));
        VFMA_M0xN0(A, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(11));
        VFMA_M0xN0(B, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(12));
        VFMA_M0xN0(C, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(13));
        VFMA_M0xN0(D, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(14));
        VFMA_M0xN0(E, a, b0, c);
        b0 = VLOAD(N0)(0, RHS_ROW_PTR(15));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
    }

    // Left-over accumulations: one value of k per iteration
    for(; i < K; ++i)
    {
        // Each a<m> is a 2-element vector initialised from a single LHS scalar
        // (presumably so that VFMA_M0xN0 can address a component of it - TODO confirm against the macro)
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = VLOAD(N0)(0, RHS_ROW_PTR(0));
        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
    }

    // Address of the first output element written by this work-item (batch offset added below)
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0, zout1=0, ... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Batched GEMM: the batches live in the fourth dimension, hence dst_stride_z is scaled by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Batched GEMM: move to the output batch selected by z
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Scale the matrix-matrix product by ALPHA
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add BETA * bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every row of the accumulators
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    // An M0 x N0 bias block is loaded from the position matching the output block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // The activation is applied after the bias addition
#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

    // NOTE(review): RHS_STEP_LOOP is not #undef'd here; removing it would change what code
    // later in this file sees - confirm nothing depends on it before cleaning it up.
#undef RHS_ROW_PTR
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002032
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
 *  The reshaped RHS matrix is read through an OpenCL image object (rhs_img) instead of a buffer.
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes). Not read by this kernel: the RHS batch is selected through the image Y coordinate using RHS_HEIGHT
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                   __read_only image2d_t rhs_img,
#if defined(BETA)
                                                   IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                   IMAGE_DECLARATION(dst),
                                                   uint lhs_stride_z,
                                                   uint rhs_stride_z,
#if defined(BETA)
                                                   uint bias_stride_z,
#endif // defined(BETA)
                                                   uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                   ,
                                                   uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                   ,
                                                   uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                  )
{
    // Pixel unit: image coordinates below are expressed in this unit rather than in single elements
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Block size
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS_STEP_LOOP is (re)defined below so that this kernel does not depend on a definition
    // left behind by another kernel in this file. The #undef makes the redefinition safe
    // whatever the macro currently expands to.
#undef RHS_STEP_LOOP

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items that fall entirely outside the M x N output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = z;
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates. The batch is folded into the Y coordinate (z_rhs * RHS_HEIGHT)
    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   // uint zin0=0, zin1=0, ... zin7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // uint zero0=0, zero1=0, ... zero15=0;

#if defined(REINTERPRET_INPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators: VEC_DATA_TYPE(DATA_TYPE, N0) c0=0, c1=0, ... c(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        // One RHS vector is read from the image and accumulated per k
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(0, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(4, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(5, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(6, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(8, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(9, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(A, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(B, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(C, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(D, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(E, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Left-over accumulations
    for(; i < K; ++i)
    {
        // Load values from LHS matrix: each a<m> is a 2-element vector initialised from a single scalar
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        x_rhs += RHS_STEP_X;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0, zout1=0, ... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

    // NOTE(review): PIXEL_UNIT and RHS_STEP_LOOP are deliberately left defined, because code
    // later in this file may expect them - TODO confirm, then #undef them here as well.
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002339#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00002340
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002341#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002342
// ARM_DOT_K0(a, b, c): accumulates into c the products a.s<k> * b.s<k> for k = 0 .. K0-1,
// in increasing order of k. Components 10..15 use the suffixes sA..sF.
//
// ARM_DOT_K0_ACC performs a single accumulation step:
//  - with -DMIXED_PRECISION: a multiply followed by "+=" on the accumulator
//  - otherwise:              fma() with the accumulator as the addend
// Its arguments are intentionally not parenthesised so that ARM_DOT_K0 expands to the same
// token sequence in both precision modes.
#if defined(MIXED_PRECISION)
#define ARM_DOT_K0_ACC(x, y, acc) acc += x * y
#else // defined(MIXED_PRECISION)
#define ARM_DOT_K0_ACC(x, y, acc) acc = fma(x, y, acc)
#endif // defined(MIXED_PRECISION)

#if K0 == 2
#define ARM_DOT_K0(a, b, c)             \
    ({                                  \
        ARM_DOT_K0_ACC(a.s0, b.s0, c);  \
        ARM_DOT_K0_ACC(a.s1, b.s1, c);  \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)             \
    ({                                  \
        ARM_DOT_K0_ACC(a.s0, b.s0, c);  \
        ARM_DOT_K0_ACC(a.s1, b.s1, c);  \
        ARM_DOT_K0_ACC(a.s2, b.s2, c);  \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)             \
    ({                                  \
        ARM_DOT_K0_ACC(a.s0, b.s0, c);  \
        ARM_DOT_K0_ACC(a.s1, b.s1, c);  \
        ARM_DOT_K0_ACC(a.s2, b.s2, c);  \
        ARM_DOT_K0_ACC(a.s3, b.s3, c);  \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)             \
    ({                                  \
        ARM_DOT_K0_ACC(a.s0, b.s0, c);  \
        ARM_DOT_K0_ACC(a.s1, b.s1, c);  \
        ARM_DOT_K0_ACC(a.s2, b.s2, c);  \
        ARM_DOT_K0_ACC(a.s3, b.s3, c);  \
        ARM_DOT_K0_ACC(a.s4, b.s4, c);  \
        ARM_DOT_K0_ACC(a.s5, b.s5, c);  \
        ARM_DOT_K0_ACC(a.s6, b.s6, c);  \
        ARM_DOT_K0_ACC(a.s7, b.s7, c);  \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)             \
    ({                                  \
        ARM_DOT_K0_ACC(a.s0, b.s0, c);  \
        ARM_DOT_K0_ACC(a.s1, b.s1, c);  \
        ARM_DOT_K0_ACC(a.s2, b.s2, c);  \
        ARM_DOT_K0_ACC(a.s3, b.s3, c);  \
        ARM_DOT_K0_ACC(a.s4, b.s4, c);  \
        ARM_DOT_K0_ACC(a.s5, b.s5, c);  \
        ARM_DOT_K0_ACC(a.s6, b.s6, c);  \
        ARM_DOT_K0_ACC(a.s7, b.s7, c);  \
        ARM_DOT_K0_ACC(a.s8, b.s8, c);  \
        ARM_DOT_K0_ACC(a.s9, b.s9, c);  \
        ARM_DOT_K0_ACC(a.sA, b.sA, c);  \
        ARM_DOT_K0_ACC(a.sB, b.sB, c);  \
        ARM_DOT_K0_ACC(a.sC, b.sC, c);  \
        ARM_DOT_K0_ACC(a.sD, b.sD, c);  \
        ARM_DOT_K0_ACC(a.sE, b.sE, c);  \
        ARM_DOT_K0_ACC(a.sF, b.sF, c);  \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002458
// ARM_DOT_K0XN0(a, b, c): runs ARM_DOT_K0 once per output column n = 0 .. N0-1.
//  - a is forwarded unchanged to every ARM_DOT_K0 invocation
//  - b is a BASENAME: the n-th invocation uses the variable b<n> (b0, b1, ... with the
//    suffixes A..F for n = 10..15), so it must be passed as a plain identifier
//  - c is a vector: the n-th invocation accumulates into component c.s<n>
#if N0 != 2 && N0 != 3 && N0 != 4 && N0 != 8 && N0 != 16
#error "N0 value not supported"
#endif // N0 not supported

#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)             \
    ({                                     \
        ARM_DOT_K0((a), (b##0), (c.s0));   \
        ARM_DOT_K0((a), (b##1), (c.s1));   \
    })
#elif N0 == 3
#define ARM_DOT_K0XN0(a, b, c)             \
    ({                                     \
        ARM_DOT_K0((a), (b##0), (c.s0));   \
        ARM_DOT_K0((a), (b##1), (c.s1));   \
        ARM_DOT_K0((a), (b##2), (c.s2));   \
    })
#elif N0 == 4
#define ARM_DOT_K0XN0(a, b, c)             \
    ({                                     \
        ARM_DOT_K0((a), (b##0), (c.s0));   \
        ARM_DOT_K0((a), (b##1), (c.s1));   \
        ARM_DOT_K0((a), (b##2), (c.s2));   \
        ARM_DOT_K0((a), (b##3), (c.s3));   \
    })
#elif N0 == 8
#define ARM_DOT_K0XN0(a, b, c)             \
    ({                                     \
        ARM_DOT_K0((a), (b##0), (c.s0));   \
        ARM_DOT_K0((a), (b##1), (c.s1));   \
        ARM_DOT_K0((a), (b##2), (c.s2));   \
        ARM_DOT_K0((a), (b##3), (c.s3));   \
        ARM_DOT_K0((a), (b##4), (c.s4));   \
        ARM_DOT_K0((a), (b##5), (c.s5));   \
        ARM_DOT_K0((a), (b##6), (c.s6));   \
        ARM_DOT_K0((a), (b##7), (c.s7));   \
    })
#elif N0 == 16
#define ARM_DOT_K0XN0(a, b, c)             \
    ({                                     \
        ARM_DOT_K0((a), (b##0), (c.s0));   \
        ARM_DOT_K0((a), (b##1), (c.s1));   \
        ARM_DOT_K0((a), (b##2), (c.s2));   \
        ARM_DOT_K0((a), (b##3), (c.s3));   \
        ARM_DOT_K0((a), (b##4), (c.s4));   \
        ARM_DOT_K0((a), (b##5), (c.s5));   \
        ARM_DOT_K0((a), (b##6), (c.s6));   \
        ARM_DOT_K0((a), (b##7), (c.s7));   \
        ARM_DOT_K0((a), (b##8), (c.s8));   \
        ARM_DOT_K0((a), (b##9), (c.s9));   \
        ARM_DOT_K0((a), (b##A), (c.sA));   \
        ARM_DOT_K0((a), (b##B), (c.sB));   \
        ARM_DOT_K0((a), (b##C), (c.sC));   \
        ARM_DOT_K0((a), (b##D), (c.sD));   \
        ARM_DOT_K0((a), (b##E), (c.sE));   \
        ARM_DOT_K0((a), (b##F), (c.sF));   \
    })
#endif // N0 conditions
2515
2516/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
2517 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
2518 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
2519 *
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002520 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
2521 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
2522 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002523 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002524 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002525 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
2526 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
2527 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002528 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
2529 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
2530 * @note Only the following configurations of M0, N0 and K0 are currently supported:
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01002531 * - M0 = 2, 3, 4, 5, 6, 7, 8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002532 * - N0 = 2, 3, 4, 8, 16
2533 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +00002534 * - V0 >= 1
2535 * - H0 >= 1
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002536 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002537 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002538 * The activation function is performed after the bias addition
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002539 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002540 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
2541 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
2542 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
2543 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
2544 *
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002545 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
2546 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
2547 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2548 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
2549 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2550 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
2551 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
2552 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
2553 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2554 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
2555 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2556 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
2557 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
2558 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
2559 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
2560 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
2561 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
2562 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
2563 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
2564 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
2565 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2566 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
2567 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2568 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002569 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002570 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
2571 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
2572 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
2573 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2574 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002575 */
2576__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
2577 IMAGE_DECLARATION(rhs),
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002578#if defined(BETA)
2579 IMAGE_DECLARATION(bias),
2580#endif // defined(BETA)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002581 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002582 uint k,
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002583 uint lhs_stride_z,
2584 uint rhs_stride_z,
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002585#if defined(BETA)
2586 uint bias_stride_z,
2587#endif //defined(BETA)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002588 uint dst_stride_z
2589#if defined(REINTERPRET_OUTPUT_AS_3D)
2590 ,
2591 uint dst_cross_plane_pad
2592#endif // REINTERPRET_OUTPUT_AS_3D
2593 )
2594{
2595 // Block size
2596#define LHS_BLOCK_SIZE ((K0) * (M0))
2597
2598#if defined(LHS_INTERLEAVE)
2599#define LHS_OFFSET_X (K0)
2600#define LHS_STEP_X ((K0) * (V0))
2601#define LHS_STEP_LOOP (1)
2602#else // defined(INTERLEAVE)
2603#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
2604#define LHS_STEP_X (K0)
2605#define LHS_STEP_LOOP (V0)
2606#endif // defined(INTERLEAVE)
2607
2608 // Block size
2609#define RHS_BLOCK_SIZE ((K0) * (N0))
2610
2611 // RHS offset and step X
2612#if defined(RHS_INTERLEAVE)
2613#define RHS_OFFSET_X (K0)
2614#define RHS_STEP_X ((K0) * (H0))
2615#define RHS_STEP_LOOP (1)
2616#else // defined(RHS_INTERLEAVE)
2617#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
2618#define RHS_STEP_X (K0)
2619#define RHS_STEP_LOOP (H0)
2620#endif // defined(RHS_INTERLEAVE)
2621
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002622#if defined(DUMMY_WORK_ITEMS)
2623 if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
2624 {
2625 return;
2626 }
2627#endif // defined(DUMMY_WORK_ITEMS)
2628
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002629 // Compute LHS matrix address
2630 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
2631 (get_global_id(2) * lhs_stride_z);
2632
2633 // Compute RHS matrix address
2634 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
2635
2636#if defined(MATRIX_B_DEPTH)
2637 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
2638 rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
2639#else // defined(MATRIX_B_DEPTH)
2640 rhs_addr += get_global_id(2) * rhs_stride_z;
2641#endif // defined(MATRIX_B_DEPTH)
2642
2643 // Initialize the accumulators
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002644 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002645
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002646 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
2647 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
Usama Arif0681e3b2019-04-25 14:28:07 +01002648
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002649 for(int i = 0; i < k; i += K0)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002650 {
2651 // Supported cases (M0, K0):
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00002652 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
2653 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
2654 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
2655 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
2656 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
2657 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
2658 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
2659 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002660 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01002661 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002662
2663 // Load values from RHS matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002664 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002665
2666 // Accumulate
2667 ARM_DOT_K0XN0(a0, b, c0);
2668#if M0 > 1
2669 ARM_DOT_K0XN0(a1, b, c1);
2670#endif // M0 > 1
2671#if M0 > 2
2672 ARM_DOT_K0XN0(a2, b, c2);
2673#endif // M0 > 2
2674#if M0 > 3
2675 ARM_DOT_K0XN0(a3, b, c3);
2676#endif // M0 > 3
2677#if M0 > 4
2678 ARM_DOT_K0XN0(a4, b, c4);
2679#endif // M0 > 4
2680#if M0 > 5
2681 ARM_DOT_K0XN0(a5, b, c5);
2682#endif // M0 > 5
2683#if M0 > 6
2684 ARM_DOT_K0XN0(a6, b, c6);
2685#endif // M0 > 6
2686#if M0 > 7
2687 ARM_DOT_K0XN0(a7, b, c7);
2688#endif // M0 > 7
2689
2690 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
2691 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
2692 }
2693
2694 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
2695
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002696 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002697
2698#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002699
2700 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +01002701 CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002702 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
2703 // multiply dst_stride_z by DEPTH_GEMM3D
2704 dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
2705
2706#else // defined(REINTERPRET_OUTPUT_AS_3D)
2707
2708 // Add offset for batched GEMM
2709 dst_addr += get_global_id(2) * dst_stride_z;
2710
2711#endif // defined(REINTERPRET_OUTPUT_AS_3D)
2712
2713 // Multiply by the weight of matrix-matrix product and store the result
2714#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01002715 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002716#endif // defined(ALPHA)
2717
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002718 // Add beta*bias
2719#if defined(BETA)
2720#if defined(BROADCAST_BIAS)
2721 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
2722
2723 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2724
2725#ifndef UNIT_BETA
2726 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
2727#endif // UNIT_BIAS
2728
2729 // c = c + bias[broadcasted]
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002730#if defined(MIXED_PRECISION)
2731 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
2732 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
2733#else // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002734 ADD_BLOCK_BROADCAST(M0, c, bias0);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002735#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002736
2737#else // defined(BROADCAST_BIAS)
2738 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
2739 2) * bias_stride_z;
2740
2741 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2742
2743#ifndef UNIT_BETA
2744 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
2745#endif // UNIT_BIAS
2746
2747 // c = c + bias
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002748#if defined(MIXED_PRECISION)
2749 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
2750 ADD_BLOCK(M0, c, bias_hp);
2751#else // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002752 ADD_BLOCK(M0, c, bias);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002753#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002754
2755#endif // defined(BROADCAST_BIAS)
2756#endif // defined(BETA)
2757
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002758#if defined(ACTIVATION_TYPE)
Georgios Pinitasa07ce152019-10-11 17:38:50 +01002759#if defined(MIXED_PRECISION)
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002760 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01002761#else // defined(MIXED_PRECISION)
2762 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
2763#endif // defined(MIXED_PRECISION)
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002764#endif // defined(ACTIVATION_TYPE)
2765
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002766 // Store output block
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002767#if defined(MIXED_PRECISION)
2768 CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
2769#else // defined(MIXED_PRECISION)
Usama Arif0681e3b2019-04-25 14:28:07 +01002770 STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002771#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002772
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002773#undef LHS_BLOCK_SIZE
2774#undef LHS_OFFSET_X
2775#undef LHS_STEP_X
2776#undef RHS_BLOCK_SIZE
2777#undef RHS_OFFSET_X
2778#undef RHS_STEP_X
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002779#undef LHS_STEP_LOOP
2780#undef RHS_STEP_LOOP
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002781}
giuros01b3204e72019-04-01 13:50:22 +01002782
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
 *
 * Each work-item accumulates one M0xN0 block of the result by walking the K dimension in steps of K0. The block is
 * then optionally scaled by ALPHA, summed with the bias, passed through the activation and stored to @p dst_ptr.
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
 *       In this kernel M and N are read by the -DDUMMY_WORK_ITEMS bounds check and K bounds the accumulation loop; the run-time argument @p k is not read.
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If -DALPHA is passed at compile time, the accumulated block is scaled by ALPHA before the bias is added.
 * @note If -DBETA is passed at compile time, the bias arguments are part of the kernel signature and the bias is added to the block.
 *       The bias is scaled by BETA unless -DUNIT_BETA is also defined. With -DBROADCAST_BIAS a single bias row is loaded and added to every row of the block.
 * @note If -DMATRIX_B_DEPTH is passed at compile time, the RHS plane is selected with (get_global_id(2) % MATRIX_B_DEPTH) instead of get_global_id(2).
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped. Not read by this kernel (the loop uses the compile-time K).
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes). Not read by this kernel (the RHS plane is addressed through RHS_HEIGHT).
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                    IMAGE_DECLARATION(dst),
                                                    uint k,
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif // defined(BETA)
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                   )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // LHS block size (in elements)
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X (in elements): the layout depends on whether the M0xK0 blocks were interleaved
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size (in pixel units)
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X (in pixel units)
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the MxN output have nothing to compute
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates. The Z index is folded into the image row through RHS_HEIGHT
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    // The loop bound is the compile-time constant K (-DK); the kernel argument k is not used here
    for(int i = 0; i < K; i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate: one ARM_DOT_K0XN0 per LHS row held by this work-item
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif // M0 > 7

        // Advance both operands to their next K0-wide slice
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += get_global_id(2) * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#endif // defined(MIXED_PRECISION)

    // The layout helpers are local to this kernel
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
3048
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003049#if defined(LHS_TRANSPOSE)
3050
// Shorthand for the vector type made of SIZE elements of TYPE
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(N0, a, b, c): multiply-accumulate, c <- a * b + c.
//  - When GPU_ARCH == GPU_ARCH_MIDGARD the expansion is a separate multiply and add (c += a * b),
//    otherwise it is a call to fma().
//  - With MIXED_PRECISION, a and b are first converted to VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0);
//    N0 is not used by the non-mixed-precision variants.
//  - The expansion already ends with a semicolon.
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)
3071#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \
3072 ({ \
3073 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003074 })
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003075#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \
3076 ({ \
3077 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
3078 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003079 })
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003080#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \
3081 ({ \
3082 ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \
3083 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003084 })
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003085#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \
3086 ({ \
3087 ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \
3088 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003089 })
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003090#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \
3091 ({ \
3092 ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \
3093 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
3094 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
3095 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
3096 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003097 })
3098
// Dispatcher for the column-vector (transposed) by row-vector (not transposed) multiplication with K0 = 1.
//  - a: the column-vector (transposed)
//  - b: the row-vector (not transposed)
//  - C: the output matrix
// Naming convention: lower case marks a vector (a, b), upper case marks a matrix (C).
// ROWS is token-pasted into the name of the variant to call, so it has to be a literal
// for which an ARM_VVM_T_NT_<ROWS>xN0x1 macro exists by the time this macro is expanded.
#define ARM_VVM_T_NT_M0xN0x1(ROWS, N0, TYPE, a, b, C) ARM_VVM_T_NT_##ROWS##xN0x1(N0, TYPE, a, b, C)
3106
// K0 = 1 matrix-by-matrix step: forwards row 0 of LHS (LHS##0) and row 0 of RHS (RHS##0)
// to the vector-level dispatcher, which accumulates into ACC.
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, LHS, RHS, ACC)                        \
    ({                                                                          \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##0), (RHS##0), ACC);            \
    })
// K0 = 2 matrix-by-matrix step: rows 0 and 1 of LHS/RHS are forwarded, in that order,
// to the vector-level dispatcher, which accumulates into ACC.
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, LHS, RHS, ACC)                        \
    ({                                                                          \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##0), (RHS##0), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##1), (RHS##1), ACC);            \
    })
// K0 = 3 matrix-by-matrix step: rows 0, 1 and 2 of LHS/RHS are forwarded, in that order,
// to the vector-level dispatcher, which accumulates into ACC.
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, LHS, RHS, ACC)                        \
    ({                                                                          \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##0), (RHS##0), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##1), (RHS##1), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##2), (RHS##2), ACC);            \
    })
// K0 = 4 matrix-by-matrix step: rows 0 .. 3 of LHS/RHS are forwarded, in that order,
// to the vector-level dispatcher, which accumulates into ACC.
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, LHS, RHS, ACC)                        \
    ({                                                                          \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##0), (RHS##0), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##1), (RHS##1), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##2), (RHS##2), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##3), (RHS##3), ACC);            \
    })
// K0 = 8 matrix-by-matrix step: rows 0 .. 7 of LHS/RHS are forwarded, in that order,
// to the vector-level dispatcher, which accumulates into ACC.
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, LHS, RHS, ACC)                        \
    ({                                                                          \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##0), (RHS##0), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##1), (RHS##1), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##2), (RHS##2), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##3), (RHS##3), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##4), (RHS##4), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##5), (RHS##5), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##6), (RHS##6), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##7), (RHS##7), ACC);            \
    })
// K0 = 16 matrix-by-matrix step: rows 0 .. 7 are handled by the K0 = 8 variant, then rows
// 8 .. F of LHS/RHS are forwarded, in that order, to the vector-level dispatcher.
//
// The matrix parameters are deliberately NOT called A, B and C: rows 10 .. 15 carry the hex
// suffixes A .. F, and a parameter named A, B or C would be substituted on the right-hand
// side of the paste as well (e.g. A##A would yield <arg><arg> instead of <arg>A).
// Rows 8 .. F call ARM_VVM_T_NT_M0xN0x1 directly because the row suffix is already appended
// here; going through ARM_MM_T_NT_M0xN0x1 would try to paste an extra 0 onto a parenthesised
// argument.
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, LHS, RHS, ACC)                       \
    ({                                                                          \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, LHS, RHS, ACC);                       \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##8), (RHS##8), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##9), (RHS##9), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##A), (RHS##A), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##B), (RHS##B), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##C), (RHS##C), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##D), (RHS##D), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##E), (RHS##E), ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##F), (RHS##F), ACC);            \
    })
3146
// Entry point for the matrix (transposed) by matrix (not transposed) multiplication.
// The block shape is selected through M0, N0 and K_DEPTH (the K0 of the block); supported values:
//  - M0: 1, 2, 3, 4, 8
//  - N0: 1, 2, 3, 4, 8, 16
//  - K0: 1, 2, 3, 4, 8, 16
// K_DEPTH selects ARM_MM_T_NT_M0xN0x<K_DEPTH>, which in turn invokes the vector-by-matrix macro K0 times.
// A, B and C are matrices.
#define ARM_MM_T_NT(M0, N0, K_DEPTH, TYPE, A, B, C) CONCAT(ARM_MM_T_NT_M0xN0x, K_DEPTH)(M0, N0, TYPE, A, B, C)
3158
3159/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
3160 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3161 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3162 *
3163 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
3164 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003165 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003166 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3167 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3168 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3169 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3170 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
3171 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3172 * - M0 = 2, 3, 4, 8
3173 * - N0 = 2, 3, 4, 8, 16
3174 * - K0 = 2, 3, 4, 8, 16
3175 * - V0 >= 1
3176 * - H0 >= 1
3177 *
3178 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3179 * The activation function is performed after the bias addition
3180 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3181 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3182 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3183 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3184 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3185 *
3186 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
3187 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3188 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3189 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3190 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3191 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3192 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
3193 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
3194 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3195 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
3196 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3197 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
3198 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3199 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3200 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3201 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3202 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3203 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3204 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3205 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3206 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3207 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3208 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3209 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003210 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003211 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3212 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3213 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3214 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3215 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3216 */
3217__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
3218 IMAGE_DECLARATION(rhs),
3219#if defined(BETA)
3220 IMAGE_DECLARATION(bias),
3221#endif // defined(BETA)
3222 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003223 uint k,
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003224 uint lhs_stride_z,
3225 uint rhs_stride_z,
3226#if defined(BETA)
3227 uint bias_stride_z,
3228#endif //defined(BETA)
3229 uint dst_stride_z
3230#if defined(REINTERPRET_OUTPUT_AS_3D)
3231 ,
3232 uint dst_cross_plane_pad
3233#endif // REINTERPRET_OUTPUT_AS_3D
3234 )
3235{
3236 // Block size
3237#define LHS_BLOCK_SIZE ((K0) * (M0))
3238
3239#if defined(LHS_INTERLEAVE)
3240#define LHS_OFFSET_X (M0)
3241#define LHS_STEP_X ((M0) * (V0))
3242#define LHS_STEP_LOOP (1)
3243#else // defined(INTERLEAVE)
3244#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3245#define LHS_STEP_X (M0)
3246#define LHS_STEP_LOOP (V0)
3247#endif // defined(INTERLEAVE)
3248
3249 // Block size
3250#define RHS_BLOCK_SIZE ((K0) * (N0))
3251
3252 // RHS offset and step X
3253#if defined(RHS_INTERLEAVE)
3254#define RHS_OFFSET_X (N0)
3255#define RHS_STEP_X ((N0) * (H0))
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003256#else // defined(RHS_INTERLEAVE)
3257#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3258#define RHS_STEP_X (N0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003259#endif // defined(RHS_INTERLEAVE)
3260
3261 const uint x = get_global_id(0);
3262 const uint y = get_global_id(1);
3263 const uint z = get_global_id(2);
3264
3265#if defined(DUMMY_WORK_ITEMS)
3266 if((x * N0 >= N) || (y * M0 >= M))
3267 {
3268 return;
3269 }
3270#endif // defined(DUMMY_WORK_ITEMS)
3271
3272 // Compute LHS matrix address
3273 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3274
3275 // Compute RHS matrix address
3276 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
3277
3278#if defined(MATRIX_B_DEPTH)
3279 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3280 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
3281#else // defined(MATRIX_B_DEPTH)
3282 rhs_addr += z * rhs_stride_z;
3283#endif // defined(MATRIX_B_DEPTH)
3284
3285 // Initialize the accumulators
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003286 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003287
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003288 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3289
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003290 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3291 __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
3292
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003293 for(int i = 0; i < k; i += K0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003294 {
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003295 VEC_DATA_TYPE(DATA_TYPE, M0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003296 a0;
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003297 VEC_DATA_TYPE(DATA_TYPE, N0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003298 b0;
3299
3300 a0 = VLOAD(M0)(0, lhs);
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003301 b0 = VLOAD(N0)(0, rhs);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003302
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003303 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003304
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003305 lhs += LHS_STEP_X;
3306 rhs += RHS_STEP_X;
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003307
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003308#if K0 > 1
3309 a0 = VLOAD(M0)(0, lhs);
3310 b0 = VLOAD(N0)(0, rhs);
3311
3312 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3313
3314 lhs += LHS_STEP_X;
3315 rhs += RHS_STEP_X;
3316#endif // K0 > 1
3317
3318#if K0 > 2
3319 a0 = VLOAD(M0)(0, lhs);
3320 b0 = VLOAD(N0)(0, rhs);
3321
3322 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3323
3324 lhs += LHS_STEP_X;
3325 rhs += RHS_STEP_X;
3326#endif // K0 > 2
3327
3328#if K0 > 3
3329 a0 = VLOAD(M0)(0, lhs);
3330 b0 = VLOAD(N0)(0, rhs);
3331
3332 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3333
3334 lhs += LHS_STEP_X;
3335 rhs += RHS_STEP_X;
3336#endif // K0 > 3
3337
3338#if K0 > 4
3339 a0 = VLOAD(M0)(0, lhs);
3340 b0 = VLOAD(N0)(0, rhs);
3341
3342 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3343
3344 lhs += LHS_STEP_X;
3345 rhs += RHS_STEP_X;
3346
3347 a0 = VLOAD(M0)(0, lhs);
3348 b0 = VLOAD(N0)(0, rhs);
3349
3350 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3351
3352 lhs += LHS_STEP_X;
3353 rhs += RHS_STEP_X;
3354
3355 a0 = VLOAD(M0)(0, lhs);
3356 b0 = VLOAD(N0)(0, rhs);
3357
3358 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3359
3360 lhs += LHS_STEP_X;
3361 rhs += RHS_STEP_X;
3362
3363 a0 = VLOAD(M0)(0, lhs);
3364 b0 = VLOAD(N0)(0, rhs);
3365
3366 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3367
3368 lhs += LHS_STEP_X;
3369 rhs += RHS_STEP_X;
3370#endif // K0 > 4
3371
3372#if K0 > 8
3373 a0 = VLOAD(M0)(0, lhs);
3374 b0 = VLOAD(N0)(0, rhs);
3375
3376 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3377
3378 lhs += LHS_STEP_X;
3379 rhs += RHS_STEP_X;
3380
3381 a0 = VLOAD(M0)(0, lhs);
3382 b0 = VLOAD(N0)(0, rhs);
3383
3384 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3385
3386 lhs += LHS_STEP_X;
3387 rhs += RHS_STEP_X;
3388
3389 a0 = VLOAD(M0)(0, lhs);
3390 b0 = VLOAD(N0)(0, rhs);
3391
3392 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3393
3394 lhs += LHS_STEP_X;
3395 rhs += RHS_STEP_X;
3396
3397 a0 = VLOAD(M0)(0, lhs);
3398 b0 = VLOAD(N0)(0, rhs);
3399
3400 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3401
3402 lhs += LHS_STEP_X;
3403 rhs += RHS_STEP_X;
3404
3405 a0 = VLOAD(M0)(0, lhs);
3406 b0 = VLOAD(N0)(0, rhs);
3407
3408 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3409
3410 lhs += LHS_STEP_X;
3411 rhs += RHS_STEP_X;
3412
3413 a0 = VLOAD(M0)(0, lhs);
3414 b0 = VLOAD(N0)(0, rhs);
3415
3416 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3417
3418 lhs += LHS_STEP_X;
3419 rhs += RHS_STEP_X;
3420
3421 a0 = VLOAD(M0)(0, lhs);
3422 b0 = VLOAD(N0)(0, rhs);
3423
3424 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3425
3426 lhs += LHS_STEP_X;
3427 rhs += RHS_STEP_X;
3428
3429 a0 = VLOAD(M0)(0, lhs);
3430 b0 = VLOAD(N0)(0, rhs);
3431
3432 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3433
3434 lhs += LHS_STEP_X;
3435 rhs += RHS_STEP_X;
3436#endif // K0 > 8
3437
3438#ifndef LHS_INTERLEAVE
3439 lhs += (M0 * K0 * (V0 - 1));
3440#endif // LHS_INTERLEAVE
3441
3442#ifndef RHS_INTERLEAVE
3443 rhs += (N0 * K0 * (H0 - 1));
3444#endif // RHS_INTERLEAVE
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003445 }
3446
3447 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3448
3449 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3450
3451#if defined(REINTERPRET_OUTPUT_AS_3D)
3452
3453 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
3454 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
3455 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3456 // multiply dst_stride_z by DEPTH_GEMM3D
3457 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3458
3459#else // defined(REINTERPRET_OUTPUT_AS_3D)
3460
3461 // Add offset for batched GEMM
3462 dst_addr += z * dst_stride_z;
3463
3464#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3465
3466 // Multiply by the weight of matrix-matrix product and store the result
3467#if defined(ALPHA)
3468 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3469#endif // defined(ALPHA)
3470
3471 // Add beta*bias
3472#if defined(BETA)
3473#if defined(BROADCAST_BIAS)
3474 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3475
3476 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3477
3478#ifndef UNIT_BETA
3479 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3480#endif // UNIT_BIAS
3481
3482 // c = c + bias[broadcasted]
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003483#if defined(MIXED_PRECISION)
3484 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3485 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3486#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003487 ADD_BLOCK_BROADCAST(M0, c, bias0);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003488#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003489
3490#else // defined(BROADCAST_BIAS)
3491 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
3492
3493 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3494
3495#ifndef UNIT_BETA
3496 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3497#endif // UNIT_BIAS
3498
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003499#if defined(MIXED_PRECISION)
3500 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3501 ADD_BLOCK(M0, c, bias_hp);
3502#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003503 ADD_BLOCK(M0, c, bias);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003504#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003505
3506#endif // defined(BROADCAST_BIAS)
3507#endif // defined(BETA)
3508
3509#if defined(ACTIVATION_TYPE)
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003510#if defined(MIXED_PRECISION)
3511 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
3512#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003513 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003514#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003515#endif // defined(ACTIVATION_TYPE)
3516
3517 // Store output block
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003518#if defined(MIXED_PRECISION)
3519 CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
3520#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003521 STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003522#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003523
3524#undef LHS_BLOCK_SIZE
3525#undef LHS_OFFSET_X
3526#undef LHS_STEP_X
3527#undef RHS_BLOCK_SIZE
3528#undef RHS_OFFSET_X
3529#undef RHS_STEP_X
3530}
3531
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003532#if defined(OPENCL_IMAGE_SUPPORT)
3533/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
3534 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3535 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3536 *
3537 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
3538 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003539 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
3540 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Gian Marco Iodice781cba72020-06-19 16:56:57 +01003541 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
3542 * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
3543 * could be different from the value returned by get_image_height(rhs_img).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003544 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3545 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3546 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3547 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3548 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
3549 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3550 * - M0 = 2, 3, 4, 8
3551 * - N0 = 4, 8, 16
3552 * - K0 = 4, 8, 16
3553 * - V0 >= 1
3554 * - H0 >= 1
3555 *
3556 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3557 * The activation function is performed after the bias addition
3558 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3559 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3560 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3561 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3562 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3563 *
3564 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
3565 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3566 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3567 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3568 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3569 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3570 * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
3571 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3572 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3573 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3574 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3575 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3576 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3577 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3578 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3579 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3580 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3581 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3582 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003583 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003584 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3585 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3586 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3587 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3588 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3589 */
3590__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
3591 __read_only image2d_t rhs_img,
3592#if defined(BETA)
3593 IMAGE_DECLARATION(bias),
3594#endif // defined(BETA)
3595 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003596 uint k,
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003597 uint lhs_stride_z,
3598 uint rhs_stride_z,
3599#if defined(BETA)
3600 uint bias_stride_z,
3601#endif //defined(BETA)
3602 uint dst_stride_z
3603#if defined(REINTERPRET_OUTPUT_AS_3D)
3604 ,
3605 uint dst_cross_plane_pad
3606#endif // REINTERPRET_OUTPUT_AS_3D
3607 )
3608{
3609 // Pixel unit
3610#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
3611
3612 // Block size
3613#define LHS_BLOCK_SIZE ((K0) * (M0))
3614
3615#if defined(LHS_INTERLEAVE)
3616#define LHS_OFFSET_X (M0)
3617#define LHS_STEP_X ((M0) * (V0))
3618#define LHS_STEP_LOOP (1)
3619#else // defined(INTERLEAVE)
3620#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3621#define LHS_STEP_X (M0)
3622#define LHS_STEP_LOOP (V0)
3623#endif // defined(INTERLEAVE)
3624
3625 // Block size
3626#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
3627
3628 // RHS offset and step X
3629#if defined(RHS_INTERLEAVE)
3630#define RHS_OFFSET_X (PIXEL_UNIT)
3631#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
3632#else // defined(RHS_INTERLEAVE)
3633#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3634#define RHS_STEP_X (PIXEL_UNIT)
3635#endif // defined(RHS_INTERLEAVE)
3636
3637 const uint x = get_global_id(0);
3638 const uint y = get_global_id(1);
3639 const uint z = get_global_id(2);
3640
3641#if defined(DUMMY_WORK_ITEMS)
3642 if((x * N0 >= N) || (y * M0 >= M))
3643 {
3644 return;
3645 }
3646#endif // defined(DUMMY_WORK_ITEMS)
3647
3648 // Compute LHS matrix address
3649 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3650
3651#if defined(MATRIX_B_DEPTH)
3652 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3653 const uint z_rhs = (z % MATRIX_B_DEPTH);
3654#else // defined(MATRIX_B_DEPTH)
3655 const uint z_rhs = z;
3656#endif // defined(MATRIX_B_DEPTH)
3657
3658 // Compute RHS matrix coordinates
3659 uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
3660 const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
3661
3662 // Initialize the accumulators
3663 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
3664
3665 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3666
3667 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3668
3669 for(int i = 0; i < K; i += K0)
3670 {
3671 VEC_DATA_TYPE(DATA_TYPE, M0)
3672 a0;
3673 VEC_DATA_TYPE(DATA_TYPE, N0)
3674 b0;
3675
3676 a0 = VLOAD(M0)(0, lhs);
3677 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
3678
3679 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3680
3681 lhs += LHS_STEP_X;
3682
3683#if K0 > 1
3684 a0 = VLOAD(M0)(0, lhs);
3685 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
3686
3687 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3688
3689 lhs += LHS_STEP_X;
3690#endif // K0 > 1
3691
3692#if K0 > 2
3693 a0 = VLOAD(M0)(0, lhs);
3694 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
3695
3696 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3697
3698 lhs += LHS_STEP_X;
3699#endif // K0 > 2
3700
3701#if K0 > 3
3702 a0 = VLOAD(M0)(0, lhs);
3703 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
3704
3705 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3706
3707 lhs += LHS_STEP_X;
3708#endif // K0 > 3
3709
3710#if K0 > 4
3711 a0 = VLOAD(M0)(0, lhs);
3712 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
3713
3714 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3715
3716 lhs += LHS_STEP_X;
3717
3718 a0 = VLOAD(M0)(0, lhs);
3719 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
3720
3721 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3722
3723 lhs += LHS_STEP_X;
3724
3725 a0 = VLOAD(M0)(0, lhs);
3726 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
3727
3728 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3729
3730 lhs += LHS_STEP_X;
3731
3732 a0 = VLOAD(M0)(0, lhs);
3733 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
3734
3735 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3736
3737 lhs += LHS_STEP_X;
3738#endif // K0 > 4
3739
3740#if K0 > 8
3741 a0 = VLOAD(M0)(0, lhs);
3742 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
3743
3744 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3745
3746 lhs += LHS_STEP_X;
3747
3748 a0 = VLOAD(M0)(0, lhs);
3749 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
3750
3751 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3752
3753 lhs += LHS_STEP_X;
3754
3755 a0 = VLOAD(M0)(0, lhs);
3756 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
3757
3758 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3759
3760 lhs += LHS_STEP_X;
3761
3762 a0 = VLOAD(M0)(0, lhs);
3763 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
3764
3765 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3766
3767 lhs += LHS_STEP_X;
3768
3769 a0 = VLOAD(M0)(0, lhs);
3770 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
3771
3772 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3773
3774 lhs += LHS_STEP_X;
3775
3776 a0 = VLOAD(M0)(0, lhs);
3777 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
3778
3779 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3780
3781 lhs += LHS_STEP_X;
3782
3783 a0 = VLOAD(M0)(0, lhs);
3784 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
3785
3786 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3787
3788 lhs += LHS_STEP_X;
3789
3790 a0 = VLOAD(M0)(0, lhs);
3791 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
3792
3793 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3794
3795 lhs += LHS_STEP_X;
3796#endif // K0 > 8
3797
3798#ifndef LHS_INTERLEAVE
3799 lhs += (M0 * K0 * (V0 - 1));
3800#endif // LHS_INTERLEAVE
3801
3802 x_rhs += K0 * RHS_STEP_X;
3803#ifndef RHS_INTERLEAVE
3804 x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
3805#endif // RHS_INTERLEAVE
3806 }
3807
3808 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3809
3810 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3811
3812#if defined(REINTERPRET_OUTPUT_AS_3D)
3813
3814 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
3815 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
3816 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3817 // multiply dst_stride_z by DEPTH_GEMM3D
3818 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3819
3820#else // defined(REINTERPRET_OUTPUT_AS_3D)
3821
3822 // Add offset for batched GEMM
3823 dst_addr += z * dst_stride_z;
3824
3825#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3826
3827 // Multiply by the weight of matrix-matrix product and store the result
3828#if defined(ALPHA)
3829 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3830#endif // defined(ALPHA)
3831
3832 // Add beta*bias
3833#if defined(BETA)
3834#if defined(BROADCAST_BIAS)
3835 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3836
3837 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3838
3839#ifndef UNIT_BETA
3840 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3841#endif // UNIT_BIAS
3842
3843 // c = c + bias[broadcasted]
3844#if defined(MIXED_PRECISION)
3845 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3846 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3847#else // defined(MIXED_PRECISION)
3848 ADD_BLOCK_BROADCAST(M0, c, bias0);
3849#endif // defined(MIXED_PRECISION)
3850
3851#else // defined(BROADCAST_BIAS)
3852 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
3853
3854 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3855
3856#ifndef UNIT_BETA
3857 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3858#endif // UNIT_BIAS
3859
3860#if defined(MIXED_PRECISION)
3861 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3862 ADD_BLOCK(M0, c, bias_hp);
3863#else // defined(MIXED_PRECISION)
3864 ADD_BLOCK(M0, c, bias);
3865#endif // defined(MIXED_PRECISION)
3866
3867#endif // defined(BROADCAST_BIAS)
3868#endif // defined(BETA)
3869
3870#if defined(ACTIVATION_TYPE)
3871#if defined(MIXED_PRECISION)
3872 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
3873#else // defined(MIXED_PRECISION)
3874 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
3875#endif // defined(MIXED_PRECISION)
3876#endif // defined(ACTIVATION_TYPE)
3877
3878 // Store output block
3879#if defined(MIXED_PRECISION)
3880 CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
3881#else // defined(MIXED_PRECISION)
3882 STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
3883#endif // defined(MIXED_PRECISION)
3884
3885#undef LHS_BLOCK_SIZE
3886#undef LHS_OFFSET_X
3887#undef LHS_STEP_X
3888#undef RHS_BLOCK_SIZE
3889#undef RHS_OFFSET_X
3890#undef RHS_STEP_X
3891#undef PIXEL_UNIT
3892#undef LHS_STEP_LOOP
3893#undef RHS_STEP_LOOP
3894}
3895#endif // defined(OPENCL_IMAGE_SUPPORT)
3896
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003897#endif // defined(LHS_TRANSPOSE)
3898
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00003899#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
3900
giuros01b3204e72019-04-01 13:50:22 +01003901#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
3902
/** Fused multiply-add accumulate: c = fma(a, b, c)
 *
 * @note The macro expands to a statement expression and names @p c twice (once as the
 *       addend, once as the assignment target), so @p c must be a side-effect-free lvalue.
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator. It is overwritten with fma(a, b, c)
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })
3907
/** Accumulate one K step into the M0 row accumulators.
 *
 * For every row r in [0, M0) the macro performs:
 *   c##r = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a##r.s##i), b, c##r)
 * i.e. component @p i of the LHS row vector a##r is converted to an N0-wide vector and
 * multiplied with the RHS vector @p b, then added to the accumulator c##r.
 *
 * @note @p i, @p a and @p c are token-pasted, so they must be plain tokens: @p i is a vector
 *       component suffix (the kernel below uses 0-9 and A-F), @p a and @p c are variable basenames.
 * @note One variant is provided for each M0 in [1, 8]; any other M0 is a compile-time error.
 *
 * @param[in]      i Component suffix selecting the element of each a##r (pasted after ".s")
 * @param[in]      a Basename of the LHS row vectors (a0 ... a(M0-1))
 * @param[in]      b RHS vector of N0 elements
 * @param[in, out] c Basename of the accumulators (c0 ... c(M0-1))
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported
3979
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS matrix is NOT reshaped
 *
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
 * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *
 * @note If -DALPHA is passed at compile time, the accumulated block is scaled by ALPHA before the bias addition.
 * @note If -DBETA is passed at compile time, the kernel takes the additional bias arguments and adds the bias to the result.
 *       The bias is scaled by BETA unless -DUNIT_BETA is passed as well.
 *       If -DBROADCAST_BIAS is passed, a single bias row is loaded and added to each of the M0 rows.
 * @note If -DMATRIX_B_DEPTH is passed at compile time, the RHS batch index is computed as (z % MATRIX_B_DEPTH) instead of z.
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
                             IMAGE_DECLARATION(rhs),
#if defined(BETA)
                             IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                             IMAGE_DECLARATION(dst),
                             uint lhs_stride_z,
                             uint rhs_stride_z,
#if defined(BETA)
                             uint bias_stride_z,
#endif //defined(BETA)
                             uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                             ,
                             uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                             ,
                             uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                            )
{
    // Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the N x M output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Compute RHS matrix address
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Per-row extra offsets: zlhs0..zlhs(M0-1) for the LHS rows, zero0..zero15 as an all-zero set
    // passed to LOAD_BLOCK for the tensors that are never reinterpreted (RHS and bias)
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    // NOTE(review): the LHS row base above uses COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) while this
    // macro receives the raw y - confirm both agree when PARTIAL_STORE_M0 is in effect
    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;

    // Main loop: consume K in chunks of K0. The remaining (K % K0) steps are handled by the left-over loop
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Supported cases (M0, K0):
        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

        // One RHS_VFMA_M0xN0 per K0 step: component k of every LHS row times RHS row bk
        RHS_VFMA_M0xN0(0, a, b0, c);
        RHS_VFMA_M0xN0(1, a, b1, c);
#if K0 > 2
        RHS_VFMA_M0xN0(2, a, b2, c);
#endif // K0 > 2
#if K0 > 3
        RHS_VFMA_M0xN0(3, a, b3, c);
#endif // K0 > 3
#if K0 > 4
        RHS_VFMA_M0xN0(4, a, b4, c);
        RHS_VFMA_M0xN0(5, a, b5, c);
        RHS_VFMA_M0xN0(6, a, b6, c);
        RHS_VFMA_M0xN0(7, a, b7, c);
#endif // K0 > 4
#if K0 > 8
        RHS_VFMA_M0xN0(8, a, b8, c);
        RHS_VFMA_M0xN0(9, a, b9, c);
        RHS_VFMA_M0xN0(A, a, bA, c);
        RHS_VFMA_M0xN0(B, a, bB, c);
        RHS_VFMA_M0xN0(C, a, bC, c);
        RHS_VFMA_M0xN0(D, a, bD, c);
        RHS_VFMA_M0xN0(E, a, bE, c);
        RHS_VFMA_M0xN0(F, a, bF, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * rhs_stride_y;
    }

    // Left-over accumulations
    for(; i < K; ++i)
    {
        // Load values from LHS matrix
        // Each LHS scalar is held in a 2-element vector because RHS_VFMA_M0xN0 reads it through the .s0 component
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
#endif // M0 > 7

        // Load one row of N0 values from RHS matrix
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
        RHS_VFMA_M0xN0(0, a, b, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += rhs_stride_y;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    // NOTE(review): same remark as for zlhs - dst_addr uses COMPUTE_M0_START_ROW while this macro receives the raw y
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
4264#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
4265
Gian Marco36a0a462018-01-12 10:21:40 +00004266#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004267/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00004268 *
Gian Marco19835e52018-01-30 13:35:54 +00004269 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004270 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
4271 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4272 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
4273 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004274 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004275 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
4276 * The activation function is performed after the bias addition
4277 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004278 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4279 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4280 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4281 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
4282 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004283 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
4284 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
4286 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
4288 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004289 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004290 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
4292 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
4294 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
4296 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4297 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
4298 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4299 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
4300 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004301 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004302 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004303 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004304 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004305 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004306 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004307 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
4308 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004309 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004310 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01004311 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004312 */
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01004313__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
4314 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004315#if defined(BETA)
4316 IMAGE_DECLARATION(src2),
4317#endif // defined(BETA)
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01004318 IMAGE_DECLARATION(dst),
4319 uint src0_stride_z,
4320 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004321#if defined(BETA)
4322 uint src2_stride_z,
4323#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004324 uint dst_stride_z
4325#if defined(REINTERPRET_OUTPUT_AS_3D)
4326 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004327 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004328#endif // REINTERPRET_OUTPUT_AS_3D
4329 )
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004330{
Gian Marco36a0a462018-01-12 10:21:40 +00004331 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
4332 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marcoae2af742018-02-15 12:35:44 +00004333 int z = get_global_id(2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004334
Gian Marco36a0a462018-01-12 10:21:40 +00004335 // Offset
4336 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
4337 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004338
Gian Marco36a0a462018-01-12 10:21:40 +00004339 // src_addr_a = address of matrix A
4340 // src_addr_b = address of matrix B
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00004341 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
4342 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
4343
4344#if defined(MATRIX_B_DEPTH)
4345 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4346 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
4347#else // defined(MATRIX_B_DEPTH)
4348 src1_addr_in_bytes += z * src1_stride_z;
4349#endif // defined(MATRIX_B_DEPTH)
4350
4351 __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
4352 __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004353
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004354 // Compute end row address for matrix B
Gian Marco36a0a462018-01-12 10:21:40 +00004355 __global float *src_end_addr_b = src_addr_b + COLS_B;
4356
4357 src_addr_a += offset_row_a;
4358 src_addr_b += offset_row_b;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004359
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004360 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004361 float4 c0 = 0.0f;
4362 float4 c1 = 0.0f;
4363 float4 c2 = 0.0f;
4364 float4 c3 = 0.0f;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004365
Gian Marco36a0a462018-01-12 10:21:40 +00004366 for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004367 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004368 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004369 float4 a0 = vload4(0, src_addr_a);
4370 float4 b0 = vload4(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004371
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004372 c0 += (float4)a0.s0 * b0;
4373 c1 += (float4)a0.s1 * b0;
4374 c2 += (float4)a0.s2 * b0;
4375 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004376
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004377 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004378 a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
4379 b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004380
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004381 c0 += (float4)a0.s0 * b0;
4382 c1 += (float4)a0.s1 * b0;
4383 c2 += (float4)a0.s2 * b0;
4384 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004385 }
4386
Gian Marco36a0a462018-01-12 10:21:40 +00004387 for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004388 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004389 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004390 float4 a0 = vload4(0, src_addr_a);
4391 float4 b0 = vload4(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004392
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004393 c0 += (float4)a0.s0 * b0;
4394 c1 += (float4)a0.s1 * b0;
4395 c2 += (float4)a0.s2 * b0;
4396 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004397 }
4398
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004399 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004400 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
4401
Gian Marcoae2af742018-02-15 12:35:44 +00004402 // Compute dst address
4403 __global uchar *dst_addr = offset(&dst, 0, 0);
4404
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004405 uint4 zout = 0;
4406
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004407#if defined(REINTERPRET_OUTPUT_AS_3D)
4408 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004409 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004410 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004411 // | |
4412 // | plane0 |
4413 // | |
4414 // |__________________|
4415 // |******************|
4416 // | cross_plane_pad |
4417 // |******************|
4418 // | |
4419 // | plane1 |
4420 // | |
4421 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004422
4423 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004424 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
4425 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004426
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004427 // Add offset due to the cross plane paddings
4428 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004429
4430 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4431 // multiply dst_stride_z by DEPTH_GEMM3D
4432 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004433#else // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marcoae2af742018-02-15 12:35:44 +00004434 // Add offset for batched GEMM
4435 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004436#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4437
4438 // Multiply by the weight of matrix-matrix product and store the result
4439#if defined(ALPHA)
4440 SCALE_BLOCK(4, float, c, ALPHA);
4441#endif // defined(ALPHA)
4442
4443 // Add beta*bias
4444#if defined(BETA)
4445 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
4446
4447#if defined(BROADCAST_BIAS)
4448 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
4449
4450 LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4451
4452#ifndef UNIT_BETA
4453 SCALE_BLOCK(1, float, bias, BETA);
4454#endif // UNIT_BIAS
4455
4456 // c = c + bias[broadcasted]
4457 ADD_BLOCK_BROADCAST(4, c, bias0);
4458
4459#else // defined(BROADCAST_BIAS)
4460 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
4461 2) * src2_stride_z;
4462
4463 LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4464
4465#ifndef UNIT_BETA
4466 SCALE_BLOCK(4, float, bias, BETA);
4467#endif // UNIT_BIAS
4468
4469 // c = c + bias
4470 ADD_BLOCK(4, c, bias);
4471
4472#endif // defined(BROADCAST_BIAS)
4473#endif // defined(BETA)
4474
4475#if defined(ACTIVATION_TYPE)
4476 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
4477#endif // defined(ACTIVATION_TYPE)
Gian Marcoae2af742018-02-15 12:35:44 +00004478
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004479 // Store 4x4 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004480 vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
4481 vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
4482 vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
4483 vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004484}
4485
/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
 *
 * Each work-item accumulates and stores one 4x4 block of F32 output values.
 *
 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note The bias matrix (src2) is only part of the kernel signature, and only added to the result, when -DBETA is passed at compile time.
 *       The bias is multiplied by BETA unless -DUNIT_BETA is defined. If -DBROADCAST_BIAS is defined, a single bias row is loaded and added to each of the four output rows.
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                         IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                         IMAGE_DECLARATION(dst),
                                                         uint src0_stride_z,
                                                         uint src1_stride_z,
#if defined(BETA)
                                                         uint src2_stride_z,
#endif // defined(BETA)
                                                         uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                         ,
                                                         uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                        )
{
    // Row index into the reshaped matrices: several work-items share one reshaped row
    // when the multiplication factors are greater than 1
    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    int z = get_global_id(2);

    // Offset (in elements) of this work-item's 4-element group inside the shared reshaped row
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src1_addr_in_bytes += z * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators (one float4 per output row of the 4x4 block)
    float4 c0 = 0.0f;
    float4 c1 = 0.0f;
    float4 c2 = 0.0f;
    float4 c3 = 0.0f;

    // Number of accumulation steps: every step advances src_addr_b by (4 * MULT_TRANSPOSE1XW_WIDTH) elements.
    // The macro is undefined right after the kernel.
#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

    // Main loop, manually unrolled by 4. The accumulation is written as per-component scalar fma calls.
    int i = 0;
    for(; i <= (int)(COLS_MTX_B - 4); i += 4)
    {
        // Step 1 of 4: load values from matrix A (interleaved) and matrix B (transposed)
        float4 a0 = vload4(0, src_addr_a);
        float4 b0 = vload4(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
        c0.s3 = fma(a0.s0, b0.s3, c0.s3);

        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
        c1.s3 = fma(a0.s1, b0.s3, c1.s3);

        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
        c2.s3 = fma(a0.s2, b0.s3, c2.s3);

        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
        c3.s3 = fma(a0.s3, b0.s3, c3.s3);

        // Step 2 of 4: load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload4(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
        c0.s3 = fma(a0.s0, b0.s3, c0.s3);

        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
        c1.s3 = fma(a0.s1, b0.s3, c1.s3);

        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
        c2.s3 = fma(a0.s2, b0.s3, c2.s3);

        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
        c3.s3 = fma(a0.s3, b0.s3, c3.s3);

        // Step 3 of 4: load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload4(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
        c0.s3 = fma(a0.s0, b0.s3, c0.s3);

        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
        c1.s3 = fma(a0.s1, b0.s3, c1.s3);

        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
        c2.s3 = fma(a0.s2, b0.s3, c2.s3);

        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
        c3.s3 = fma(a0.s3, b0.s3, c3.s3);

        // Step 4 of 4: load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload4(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
        c0.s3 = fma(a0.s0, b0.s3, c0.s3);

        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
        c1.s3 = fma(a0.s1, b0.s3, c1.s3);

        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
        c2.s3 = fma(a0.s2, b0.s3, c2.s3);

        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
        c3.s3 = fma(a0.s3, b0.s3, c3.s3);
    }

    // Left-over loop: handles the remaining steps when COLS_MTX_B is not a multiple of 4
    for(; i < (int)(COLS_MTX_B); ++i)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        float4 a0 = vload4(0, src_addr_a);
        float4 b0 = vload4(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
        c0.s3 = fma(a0.s0, b0.s3, c0.s3);

        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
        c1.s3 = fma(a0.s1, b0.s3, c1.s3);

        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
        c2.s3 = fma(a0.s2, b0.s3, c2.s3);

        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
        c3.s3 = fma(a0.s3, b0.s3, c3.s3);
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset; stays 0 unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    // Clamp to the last plane. The scalar bound is converted to uint4 explicitly
    zout = min((uint4)(DEPTH_GEMM3D - 1), zout);

    // Add offset due to the cross plane paddings
    zout *= (cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(4, float, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // Only the X position is used to address the bias: the same row is added to all four output rows
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, float, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(4, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
                                    2) * src2_stride_z;

    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(4, float, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(4, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store 4x4 block (each row adds its own cross-plane byte offset)
    vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
    vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
    vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
    vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
}

// Undefine local defines
#undef COLS_MTX_B
4816
Matthew Bentham6f31f8c2017-10-27 11:50:06 +01004817#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004818/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00004819 *
Gian Marco19835e52018-01-30 13:35:54 +00004820 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004821 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
4822 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4823 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
4824 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004825 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004826 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
4827 * The activation function is performed after the bias addition
4828 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004829 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4830 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4831 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4832 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
4833 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004834 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
4835 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
4836 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4837 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
4838 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4839 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004840 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004841 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
4842 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4843 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
4844 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4845 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004846 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
4847 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4848 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
4849 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4850 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
4851 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004852 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004853 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004854 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004855 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004856 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004857 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004858 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
4859 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004860 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004861 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01004862 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004863 */
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                 IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
#if defined(BETA)
                                                 uint src2_stride_z,
#endif // defined(BETA)
                                                 uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                )
{
    // Each work-item accumulates and stores a 4x8 block of the destination matrix.
    // x / y select the row of the reshaped matrix B / matrix A to read, z selects the batch
    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    int z = get_global_id(2);

    // Offset (in elements) of the first values read by this work-item inside the selected reshaped rows
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src1_addr_in_bytes += z * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

    // Compute end row address for matrix B (taken before the per-work-item offset is applied)
    __global half *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    half8 c0 = 0.0f;
    half8 c1 = 0.0f;
    half8 c2 = 0.0f;
    half8 c3 = 0.0f;

    // Main loop: two accumulation steps per iteration
    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        half4 a0 = vload4(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        c0 += (half8)a0.s0 * b0;
        c1 += (half8)a0.s1 * b0;
        c2 += (half8)a0.s2 * b0;
        c3 += (half8)a0.s3 * b0;

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
        b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

        c0 += (half8)a0.s0 * b0;
        c1 += (half8)a0.s1 * b0;
        c2 += (half8)a0.s2 * b0;
        c3 += (half8)a0.s3 * b0;
    }

    // Left-over loop: one accumulation step per iteration
    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        half4 a0 = vload4(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        c0 += (half8)a0.s0 * b0;
        c1 += (half8)a0.s1 * b0;
        c2 += (half8)a0.s2 * b0;
        c3 += (half8)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset; stays zero unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    // Clamp to the last plane. The bound is widened explicitly to uint4 so that both arguments of min()
    // have the same vector type, as required by the overloads declared in the OpenCL C specification
    zout = min((uint4)(DEPTH_GEMM3D - 1), zout);

    // Add offset due to the cross plane paddings
    zout *= (cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product
#if defined(ALPHA)
    SCALE_BLOCK(4, half, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to all four rows of the block
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, half, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(4, c, bias0);

#else // defined(BROADCAST_BIAS)
    // A 4x8 bias block matching the position of the output block is loaded
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
                                    2) * src2_stride_z;

    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(4, half, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(4, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store 4x8 block
    vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
    vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
    vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
    vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
}
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005037
/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.
 *
 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
                                                       IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                       IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                       IMAGE_DECLARATION(dst),
                                                       uint src0_stride_z,
                                                       uint src1_stride_z,
#if defined(BETA)
                                                       uint src2_stride_z,
#endif // defined(BETA)
                                                       uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                       ,
                                                       uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                      )
{
    // Each work-item accumulates and stores a 4x8 block of the destination matrix.
    // x / y select the row of the reshaped matrix B / matrix A to read, z selects the batch
    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    int z = get_global_id(2);

    // Offset (in elements) of the first values read by this work-item inside the selected reshaped rows
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src1_addr_in_bytes += z * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

    // Compute end row address for matrix B (taken before the per-work-item offset is applied)
    __global half *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators (kept in F32; inputs are converted from F16 after loading)
    float8 c0 = 0.0f;
    float8 c1 = 0.0f;
    float8 c2 = 0.0f;
    float8 c3 = 0.0f;

    // Main loop: two accumulation steps per iteration
    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        float4 a0 = convert_float4(vload4(0, src_addr_a));
        float8 b0 = convert_float8(vload8(0, src_addr_b));

        c0 += (float8)a0.s0 * b0;
        c1 += (float8)a0.s1 * b0;
        c2 += (float8)a0.s2 * b0;
        c3 += (float8)a0.s3 * b0;

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
        b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));

        c0 += (float8)a0.s0 * b0;
        c1 += (float8)a0.s1 * b0;
        c2 += (float8)a0.s2 * b0;
        c3 += (float8)a0.s3 * b0;
    }

    // Left-over loop: one accumulation step per iteration
    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        float4 a0 = convert_float4(vload4(0, src_addr_a));
        float8 b0 = convert_float8(vload8(0, src_addr_b));

        c0 += (float8)a0.s0 * b0;
        c1 += (float8)a0.s1 * b0;
        c2 += (float8)a0.s2 * b0;
        c3 += (float8)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset; stays zero unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    // Clamp to the last plane. The bound is widened explicitly to uint4 so that both arguments of min()
    // have the same vector type, as required by the overloads declared in the OpenCL C specification
    zout = min((uint4)(DEPTH_GEMM3D - 1), zout);

    // Add offset due to the cross plane paddings
    zout *= (cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product
#if defined(ALPHA)
    SCALE_BLOCK(4, float, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias (the bias is loaded as F16 and converted to F32 before being scaled and added)
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to all four rows of the block
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

    float8 bias_f0 = convert_float8(bias0);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, float, bias_f, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(4, c, bias_f0);

#else // defined(BROADCAST_BIAS)
    // A 4x8 bias block matching the position of the output block is loaded
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
                                    2) * src2_stride_z;

    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

    float8 bias_f0 = convert_float8(bias0);
    float8 bias_f1 = convert_float8(bias1);
    float8 bias_f2 = convert_float8(bias2);
    float8 bias_f3 = convert_float8(bias3);

#ifndef UNIT_BETA
    SCALE_BLOCK(4, float, bias_f, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(4, c, bias_f);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // Convert the F32 accumulators to F16; the activation is applied on the converted values
    half8 c_h0 = convert_half8(c0);
    half8 c_h1 = convert_half8(c1);
    half8 c_h2 = convert_half8(c2);
    half8 c_h3 = convert_half8(c3);

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store 4x8 block
    vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
    vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
    vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
    vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
}
5267
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005268/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00005269 *
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005270 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005271 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
5272 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
5273 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
5274 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005275 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005276 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
5277 * The activation function is performed after the bias addition
5278 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005279 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
5280 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
5281 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
5282 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
5283 *
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005284 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
5285 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
5286 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5287 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
5288 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5289 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
5290 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
5291 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
5292 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5293 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
5294 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5295 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
5297 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
5298 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
5299 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
5300 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
5301 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005302 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
5303 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
5304 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
5305 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
5306 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
5307 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005308 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
5309 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
5310 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005312 */
5313__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
5314 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005315#if defined(BETA)
5316 IMAGE_DECLARATION(src2),
5317#endif // defined(BETA)
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005318 IMAGE_DECLARATION(dst),
5319 uint src0_stride_z,
5320 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005321#if defined(BETA)
5322 uint src2_stride_z,
5323#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005324 uint dst_stride_z
5325#if defined(REINTERPRET_OUTPUT_AS_3D)
5326 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005327 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005328#endif // REINTERPRET_OUTPUT_AS_3D
5329 )
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005330{
5331 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
5332 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
5333 int z = get_global_id(2);
5334
5335 // Offset
5336 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
5337 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
5338
5339 // src_addr_a = address of matrix A
5340 // src_addr_b = address of matrix B
5341 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
5342 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
5343
5344#if defined(MATRIX_B_DEPTH)
5345 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
5346 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
5347#else // defined(MATRIX_B_DEPTH)
5348 src1_addr_in_bytes += z * src1_stride_z;
5349#endif // defined(MATRIX_B_DEPTH)
5350
5351 __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
5352 __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
5353
5354 // Compute end row address for matrix B
5355 __global half *src_end_addr_b = src_addr_b + COLS_B;
5356
5357 src_addr_a += offset_row_a;
5358 src_addr_b += offset_row_b;
5359
5360 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005361 half8 c0 = 0.0f;
5362 half8 c1 = 0.0f;
5363 half8 c2 = 0.0f;
5364 half8 c3 = 0.0f;
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005365
5366#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))
5367
5368 int i = 0;
5369 for(; i <= (int)(COLS_MTX_B - 4); i += 4)
5370 {
5371#if MULT_INTERLEAVE4X4_HEIGHT == 1
5372 // Load values from matrix A (interleaved) and matrix B (transposed)
5373 half8 a0 = vload8(0, src_addr_a);
5374 half8 b0 = vload8(0, src_addr_b);
5375
5376 src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
5377 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5378
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005379 c0 = fma((half8)a0.s0, b0, c0);
5380 c1 = fma((half8)a0.s1, b0, c1);
5381 c2 = fma((half8)a0.s2, b0, c2);
5382 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005383
5384 // Load values from matrix B (transposed)
5385 b0 = vload8(0, src_addr_b);
5386
5387 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5388
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005389 c0 = fma((half8)a0.s4, b0, c0);
5390 c1 = fma((half8)a0.s5, b0, c1);
5391 c2 = fma((half8)a0.s6, b0, c2);
5392 c3 = fma((half8)a0.s7, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005393
5394 // Load values from matrix A (interleaved) and matrix B (transposed)
5395 a0 = vload8(0, src_addr_a);
5396 b0 = vload8(0, src_addr_b);
5397
5398 src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
5399 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5400
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005401 c0 = fma((half8)a0.s0, b0, c0);
5402 c1 = fma((half8)a0.s1, b0, c1);
5403 c2 = fma((half8)a0.s2, b0, c2);
5404 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005405
5406 // Load values from matrix B (transposed)
5407 b0 = vload8(0, src_addr_b);
5408
5409 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5410
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005411 c0 = fma((half8)a0.s4, b0, c0);
5412 c1 = fma((half8)a0.s5, b0, c1);
5413 c2 = fma((half8)a0.s6, b0, c2);
5414 c3 = fma((half8)a0.s7, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005415#else // MULT_INTERLEAVE4X4_HEIGHT == 1
5416 // Load values from matrix A (interleaved) and matrix B (transposed)
5417 half4 a0 = vload4(0, src_addr_a);
5418 half8 b0 = vload8(0, src_addr_b);
5419
5420 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5421 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5422
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005423 c0 = fma((half8)a0.s0, b0, c0);
5424 c1 = fma((half8)a0.s1, b0, c1);
5425 c2 = fma((half8)a0.s2, b0, c2);
5426 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005427
5428 // Load values from matrix A (interleaved) and matrix B (transposed)
5429 a0 = vload4(0, src_addr_a);
5430 b0 = vload8(0, src_addr_b);
5431
5432 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5433 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5434
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005435 c0 = fma((half8)a0.s0, b0, c0);
5436 c1 = fma((half8)a0.s1, b0, c1);
5437 c2 = fma((half8)a0.s2, b0, c2);
5438 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005439
5440 // Load values from matrix A (interleaved) and matrix B (transposed)
5441 a0 = vload4(0, src_addr_a);
5442 b0 = vload8(0, src_addr_b);
5443
5444 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5445 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5446
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005447 c0 = fma((half8)a0.s0, b0, c0);
5448 c1 = fma((half8)a0.s1, b0, c1);
5449 c2 = fma((half8)a0.s2, b0, c2);
5450 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005451
5452 // Load values from matrix A (interleaved) and matrix B (transposed)
5453 a0 = vload4(0, src_addr_a);
5454 b0 = vload8(0, src_addr_b);
5455
5456 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5457 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5458
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005459 c0 = fma((half8)a0.s0, b0, c0);
5460 c1 = fma((half8)a0.s1, b0, c1);
5461 c2 = fma((half8)a0.s2, b0, c2);
5462 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005463#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
5464 }
5465
5466 for(; i < (int)(COLS_MTX_B); ++i)
5467 {
5468 // Load values from matrix A (interleaved) and matrix B (transposed)
5469 half4 a0 = vload4(0, src_addr_a);
5470 half8 b0 = vload8(0, src_addr_b);
5471
5472 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5473 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5474
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005475 c0 = fma((half8)a0.s0, b0, c0);
5476 c1 = fma((half8)a0.s1, b0, c1);
5477 c2 = fma((half8)a0.s2, b0, c2);
5478 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005479 }
5480
5481 // Compute destination address
5482 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
5483
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005484 // Compute dst address
5485 __global uchar *dst_addr = offset(&dst, 0, 0);
5486
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005487 uint4 zout = 0;
5488
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005489#if defined(REINTERPRET_OUTPUT_AS_3D)
5490 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005491 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005492 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005493 // | |
5494 // | plane0 |
5495 // | |
5496 // |__________________|
5497 // |******************|
5498 // | cross_plane_pad |
5499 // |******************|
5500 // | |
5501 // | plane1 |
5502 // | |
5503 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005504
5505 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005506 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
5507 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005508
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005509 // Add offset due to the cross plane paddings
5510 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005511
5512 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
5513 // multiply dst_stride_z by DEPTH_GEMM3D
5514 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005515#else // defined(REINTERPRET_OUTPUT_AS_3D)
5516 // Add offset for batched GEMM
5517 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005518#endif // defined(REINTERPRET_OUTPUT_AS_3D)
5519
5520 // Multiply by the weight of matrix-matrix product and store the result
5521#if defined(ALPHA)
5522 SCALE_BLOCK(4, half, c, ALPHA);
5523#endif // defined(ALPHA)
5524
5525 // Add beta*bias
5526#if defined(BETA)
5527 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
5528
5529#if defined(BROADCAST_BIAS)
5530 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
5531
5532 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5533
5534#ifndef UNIT_BETA
5535 SCALE_BLOCK(1, half, bias, BETA);
5536#endif // UNIT_BIAS
5537
5538 // c = c + bias[broadcasted]
5539 ADD_BLOCK_BROADCAST(4, c, bias0);
5540
5541#else // defined(BROADCAST_BIAS)
5542 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
5543 2) * src2_stride_z;
5544
5545 LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5546
5547#ifndef UNIT_BETA
5548 SCALE_BLOCK(4, half, bias, BETA);
5549#endif // UNIT_BIAS
5550
5551 // c = c + bias
5552 ADD_BLOCK(4, c, bias);
5553
5554#endif // defined(BROADCAST_BIAS)
5555#endif // defined(BETA)
5556
5557#if defined(ACTIVATION_TYPE)
5558 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
5559#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005560
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005561 // Store 4x8 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005562 vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
5563 vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
5564 vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
5565 vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005566}
Georgios Pinitas84225582018-05-14 12:00:05 +01005567
5568// Undefine local defines
5569#undef COLS_MTX_B
5570
Matthew Bentham6f31f8c2017-10-27 11:50:06 +01005571#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01005572
Gian Marco36a0a462018-01-12 10:21:40 +00005573#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01005574
// The kernels in this section are compiled only when all three build options are provided
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)
#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * Each work-item accumulates a block of NUM_ELEMS_PROCESSED_PER_THREAD_Y rows by NUM_ELEMS_PROCESSED_PER_THREAD_X columns of the output.
 * It walks the COLS_A elements of its rows of matrix A two at a time (a second loop handles a leftover element), loads the matching
 * rows of matrix B, and finally applies the optional alpha scaling, beta * bias addition and activation before storing the block.
 *
 * @note This OpenCL kernel works with floating point data types (F16/F32)
 * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
 *       The accumulators and matrix A loads in this kernel are written out explicitly for at most 4 rows (acc0..acc3)
 * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 *          NOTE(review): the kernel derives the plane index from the row index (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y + row),
 *          so this product presumably refers to the number of rows rather than columns - confirm against the host code.
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
                                     IMAGE_DECLARATION(src1),
#if defined(BETA)
                                     IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                     IMAGE_DECLARATION(dst),
                                     uint src0_stride_z,
                                     uint src1_stride_z,
#if defined(BETA)
                                     uint src2_stride_z,
#endif // defined(BETA)
                                     uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                     ,
                                     uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                     ,
                                     uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                    )
{
    // First output column handled by this work-item
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and matrix B
    // src_addr holds byte offsets: .s0 is applied to src0_ptr (matrix A), .s1 to src1_ptr (matrix B)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx * sizeof(DATA_TYPE);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp to the last plane; the bound is splatted explicitly so both operands of min() share the same vector type
    zin = min((uint4)(DEPTH_GEMM3D - 1), zin);

    // Add offset due to the cross plane paddings
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Byte offset one past the last element of the first row of matrix A handled by this work-item
    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

    // One accumulator vector per output row
    VECTOR_TYPE acc0 = 0.0f;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_TYPE acc1 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_TYPE acc2 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_TYPE acc3 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consume two elements of each row of matrix A (and two rows of matrix B) per iteration
    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        // LOAD_BLOCK (gemm_helpers.h) is expected to declare a0..a<Y-1>, which the accumulate step below reads;
        // zin.s is the prefix of the per-row cross-plane offsets (zin.s0, zin.s1, ...)
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        VEC_DATA_TYPE(DATA_TYPE, 2) a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        VEC_DATA_TYPE(DATA_TYPE, 2) a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        VEC_DATA_TYPE(DATA_TYPE, 2) a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        VEC_DATA_TYPE(DATA_TYPE, 2) a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B (two consecutive rows)
        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
        VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

        // Accumulate
        acc0 += b0 * (VECTOR_TYPE)a0.s0;
        acc0 += b1 * (VECTOR_TYPE)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (VECTOR_TYPE)a1.s0;
        acc1 += b1 * (VECTOR_TYPE)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (VECTOR_TYPE)a2.s0;
        acc2 += b1 * (VECTOR_TYPE)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (VECTOR_TYPE)a3.s0;
        acc3 += b1 * (VECTOR_TYPE)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Leftover loop: consume the remaining elements of matrix A one at a time
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

        // Accumulate
        acc0 += b0 * (VECTOR_TYPE)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (VECTOR_TYPE)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (VECTOR_TYPE)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (VECTOR_TYPE)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Batch index
    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row byte offset added at store time; stays 0 unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp to the last plane; the bound is splatted explicitly so both operands of min() share the same vector type
    zout = min((uint4)(DEPTH_GEMM3D - 1), zout);

    // Add offset due to the cross plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    // zero0..zero<Y-1>: null z-offsets passed to LOAD_BLOCK when reading the bias
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every output row
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else // defined(BROADCAST_BIAS)
    // The bias has one row per output row: address the same (x, y, z) block as the output
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);
}
#endif // defined(DATA_TYPE)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01005889
Michele Di Giorgiof6f08da2018-04-26 10:24:30 +01005890/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005891 *
5892 * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
5893 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
5894 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
5895 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
5896 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005897 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
5898 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005899 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005900 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
5901 * The activation function is performed after the bias addition
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005902 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
5903 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005904 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
5905 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
5906 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
5907 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
5908 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005909 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005910 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
5911 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5912 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
5913 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5914 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
5915 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
5916 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
5917 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5918 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
5919 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5920 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
5922 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
5923 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
5924 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
5925 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
5926 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005927 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
5928 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
5929 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
5930 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
5931 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
5932 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005933 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
5934 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005935 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005936 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005937 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
5938 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005939 */
// Summary: F32 matrix multiplication on non-reshaped inputs using scalar fma(). Each work-item accumulates an
// output tile of NUM_ELEMS_PROCESSED_PER_THREAD_Y rows (1 to 4 are handled below) by 4 columns, then optionally
// applies ALPHA scaling, BETA * bias addition and an activation before storing the tile.
// NOTE(review): the accumulators, the bias address and the stores are hard-wired to 4 floats along X, so this
// presumably requires NUM_ELEMS_PROCESSED_PER_THREAD_X == 4 - confirm against the host-side configuration.
5940__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
5941 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005942#if defined(BETA)
5943 IMAGE_DECLARATION(src2),
5944#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00005945 IMAGE_DECLARATION(dst),
5946 uint src0_stride_z,
5947 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005948#if defined(BETA)
5949 uint src2_stride_z,
5950#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005951 uint dst_stride_z
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005952#if defined(REINTERPRET_INPUT_AS_3D)
5953 ,
5954 uint src_cross_plane_pad
5955#endif // REINTERPRET_INPUT_AS_3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005956#if defined(REINTERPRET_OUTPUT_AS_3D)
5957 ,
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005958 uint dst_cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005959#endif // REINTERPRET_OUTPUT_AS_3D
5960 )
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005961{
    // Offset along X, in float elements, used below to position this work-item inside matrix B
5962 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
5963
5964 // Compute starting address for matrix A and matrix B
    // src_addr.s0 is a byte offset applied to src0_ptr (matrix A), src_addr.s1 a byte offset applied to src1_ptr (matrix B)
5965 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
5966
5967 // Update address for matrix A
5968 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
5969
5970 // Update address for matrix B
5971 src_addr.s1 += idx * sizeof(float);
5972
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005973#if defined(REINTERPRET_INPUT_AS_3D)
5974 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
5975 // in order to take into account the presence of possible cross plane paddings
5976 //
5977 // | |
5978 // | plane0 |
5979 // | |
5980 // |__________________|
5981 // |******************|
5982 // | cross_plane_pad |
5983 // |******************|
5984 // | |
5985 // | plane1 |
5986 // | |
5987 // |__________________|
5988
5989 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
5990 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp each plane index to the last plane (DEPTH_GEMM3D - 1)
5991 zin = min(DEPTH_GEMM3D - 1, zin);
5992
5993 // Add offset due to the cross plane paddings
    // After this, zin.s<r> is the extra byte offset for row r: plane index * src_cross_plane_pad * src0_stride_y
5994 zin *= (src_cross_plane_pad * src0_stride_y);
5995
5996 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
5997 // multiply src0_stride_z by DEPTH_GEMM3D
5998 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
5999
6000#else // defined(REINTERPRET_INPUT_AS_3D)
6001
Gian Marcoae2af742018-02-15 12:35:44 +00006002 // Add offset for batched GEMM
6003 src_addr.s0 += get_global_id(2) * src0_stride_z;
6004
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006005#endif // defined(REINTERPRET_INPUT_AS_3D)
6006
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006007#if defined(MATRIX_B_DEPTH)
6008 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
6009 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6010#else // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006011 src_addr.s1 += get_global_id(2) * src1_stride_z;
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006012#endif // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006013
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006014 // Initialize accumulators
    // One float4 accumulator per output row handled by this work-item (acc<r> belongs to row r)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006015 float4 acc0 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006016
6017#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006018 float4 acc1 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006019#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6020
6021#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006022 float4 acc2 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006023#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6024
6025#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006026 float4 acc3 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006027#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6028
6029 // A and B src indices get incremented at the same time.
    // Main loop: each iteration consumes four columns of A (one float4 per row) and four consecutive rows of B
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006030 int i = 0;
6031 for(; i <= ((int)COLS_A - 4); i += 4)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006032 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006033#if defined(REINTERPRET_INPUT_AS_3D)
6034 // Load values from matrix A
    // NOTE(review): LOAD_BLOCK is defined outside this chunk; presumably it declares a0..a<N-1> as float4 and adds
    // zin.s<r> to the address of row r, mirroring the explicit loads in the #else branch - confirm.
Usama Arif0681e3b2019-04-25 14:28:07 +01006035 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
6036#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006037 // Load values from matrix A
6038 float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006039#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006040 float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006041#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6042#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006043 float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006044#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6045#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006046 float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006047#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006048#endif // defined(REINTERPRET_INPUT_AS_3D)
6049
    // Step 1 of 4: first B row of this iteration, paired with component .s0 of every A row
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006050 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6051 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006052
6053 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006054 acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
6055 acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
6056 acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
6057 acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006058
6059#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006060
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006061 acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
6062 acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
6063 acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
6064 acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006065
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006066#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6067#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006068
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006069 acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
6070 acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
6071 acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
6072 acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006073
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006074#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6075#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006076
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006077 acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
6078 acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
6079 acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
6080 acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006081#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006082
6083 // Load the next row of matrix B
    // Step 2 of 4: paired with component .s1 of every A row
6084 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6085 src_addr.s1 += src1_stride_y;
6086
6087 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006088 acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
6089 acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
6090 acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
6091 acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006092
6093#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6094
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006095 acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
6096 acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
6097 acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
6098 acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006099
6100#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6101#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6102
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006103 acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
6104 acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
6105 acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
6106 acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006107
6108#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6109#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6110
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006111 acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
6112 acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
6113 acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
6114 acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006115#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6116
6117 // Load the next row of matrix B
    // Step 3 of 4: paired with component .s2 of every A row
6118 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6119 src_addr.s1 += src1_stride_y;
6120
6121 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006122 acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
6123 acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
6124 acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
6125 acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006126
6127#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6128
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006129 acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
6130 acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
6131 acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
6132 acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006133
6134#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6135#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6136
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006137 acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
6138 acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
6139 acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
6140 acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006141
6142#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6143#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6144
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006145 acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
6146 acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
6147 acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
6148 acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006149#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6150
6151 // Load the next row of matrix B
    // Step 4 of 4: paired with component .s3 of every A row
6152 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6153 src_addr.s1 += src1_stride_y;
6154
6155 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006156 acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
6157 acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
6158 acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
6159 acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006160
6161#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6162
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006163 acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
6164 acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
6165 acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
6166 acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006167
6168#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6169#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6170
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006171 acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
6172 acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
6173 acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
6174 acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006175
6176#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6177#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6178
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006179 acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
6180 acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
6181 acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
6182 acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006183#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6184
    // Advance matrix A by the four columns consumed in this iteration
6185 src_addr.s0 += 4 * sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006186 }
6187
    // Tail loop: processes the columns of A left over by the main loop (fewer than four), one scalar per row at a time
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006188 for(; i < (int)COLS_A; ++i)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006189 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006190#if defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006191 // Load values from matrix A
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006192 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
6193#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6194 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6195#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6196#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6197 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6198#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6199#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6200 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6201#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6202#else // defined(REINTERPRET_INPUT_AS_3D)
6203 // Load values from matrix A
6204 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006205#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6206 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6207#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6208#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6209 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6210#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6211#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6212 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6213#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006214#endif // defined(REINTERPRET_INPUT_AS_3D)
6215
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006216 // Load values from matrix B
6217 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006218 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006219
6220 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006221 acc0.s0 = fma(a0, b0.s0, acc0.s0);
6222 acc0.s1 = fma(a0, b0.s1, acc0.s1);
6223 acc0.s2 = fma(a0, b0.s2, acc0.s2);
6224 acc0.s3 = fma(a0, b0.s3, acc0.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006225#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006226 acc1.s0 = fma(a1, b0.s0, acc1.s0);
6227 acc1.s1 = fma(a1, b0.s1, acc1.s1);
6228 acc1.s2 = fma(a1, b0.s2, acc1.s2);
6229 acc1.s3 = fma(a1, b0.s3, acc1.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006230#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6231#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006232 acc2.s0 = fma(a2, b0.s0, acc2.s0);
6233 acc2.s1 = fma(a2, b0.s1, acc2.s1);
6234 acc2.s2 = fma(a2, b0.s2, acc2.s2);
6235 acc2.s3 = fma(a2, b0.s3, acc2.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006236#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6237#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006238 acc3.s0 = fma(a3, b0.s0, acc3.s0);
6239 acc3.s1 = fma(a3, b0.s1, acc3.s1);
6240 acc3.s2 = fma(a3, b0.s2, acc3.s2);
6241 acc3.s3 = fma(a3, b0.s3, acc3.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006242#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006243
6244 src_addr.s0 += sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006245 }
6246
    // Batch index, used below to offset the destination address along Z
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006247 int z = get_global_id(2);
6248
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006249 // Compute destination address
6250 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
6251
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006252 // Compute dst address
6253 __global uchar *dst_addr = offset(&dst, 0, 0);
6254
    // Per-row extra byte offset added to the stores; it stays zero unless REINTERPRET_OUTPUT_AS_3D is defined
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006255 uint4 zout = 0;
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00006256
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006257#if defined(REINTERPRET_OUTPUT_AS_3D)
6258 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006259 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006260 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006261 // | |
6262 // | plane0 |
6263 // | |
6264 // |__________________|
6265 // |******************|
6266 // | cross_plane_pad |
6267 // |******************|
6268 // | |
6269 // | plane1 |
6270 // | |
6271 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006272
6273 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006274 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp each plane index to the last plane (DEPTH_GEMM3D - 1)
6275 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006276
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006277 // Add offset due to the cross plane paddings
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006278 zout *= (dst_cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006279
6280 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6281 // multiply dst_stride_z by DEPTH_GEMM3D
6282 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006283#else // defined(REINTERPRET_OUTPUT_AS_3D)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006284 // Add offset for batched GEMM
6285 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006286#endif // defined(REINTERPRET_OUTPUT_AS_3D)
6287
6288 // Multiply by the weight of matrix-matrix product (ALPHA)
6289#if defined(ALPHA)
6290 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
6291#endif // defined(ALPHA)
6292
6293 // Add beta*bias
6294#if defined(BETA)
    // NOTE(review): REPEAT_VAR_INIT_TO_CONST is defined outside this chunk; presumably it declares zero0..zero<N-1>
    // initialised to 0, which are then passed as the Z offsets of the bias loads below - confirm.
6295 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
6296
6297#if defined(BROADCAST_BIAS)
    // Only the X offset is applied to the bias address, and a single row is loaded and added to every accumulator row
6298 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
6299
6300 LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
6301
6302#ifndef UNIT_BETA
6303 SCALE_BLOCK(1, float, bias, BETA);
6304#endif // UNIT_BETA
6305
6306 // acc = acc + bias[broadcasted]
6307 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
6308
6309#else // defined(BROADCAST_BIAS)
    // The bias address is offset along X, Y and Z, and one bias row is loaded per accumulator row
6310 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *
6311 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
6312
6313 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
6314
6315#ifndef UNIT_BETA
6316 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
6317#endif // UNIT_BETA
6318
6319 // acc = acc + bias
6320 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
6321
6322#endif // defined(BROADCAST_BIAS)
6323#endif // defined(BETA)
6324
    // The activation, when requested, is applied after the ALPHA scaling and the bias addition
6325#if defined(ACTIVATION_TYPE)
6326 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
6327#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006328
6329 // Store the output block
    // Row r is written with vstore4 at dst_addr + r * dst_stride_y + zout.s<r>
    // NOTE(review): no bounds checks are visible in this kernel for the vector loads or stores; presumably the host
    // pads the tensors so that partial tiles stay in bounds - confirm.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006330 vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006331#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006332 vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006333#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6334#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006335 vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006336#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6337#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006338 vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006339#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006340}
6341
6342/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
6343 *
6344 * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
6345 * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
6346 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
6347 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
6348 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
6349 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006350 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
6351 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006352 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006353 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
6354 * The activation function is performed after the bias addition
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006355 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
6356 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006357 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
6358 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
6359 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
6360 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped
6361 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006362 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006363 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
6364 * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)
6365 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
6366 * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)
6367 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
6368 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
6369 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
6370 * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
6371 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
6372 * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
6373 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006374 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
6375 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
6376 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
6377 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
6378 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
6379 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006380 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
6381 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
6382 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
6383 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
6384 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
6385 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006386 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
6387 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006388 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006389 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006390 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
6391 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006392 */
6393__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
6394 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006395#if defined(BETA)
6396 IMAGE_DECLARATION(src2),
6397#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00006398 IMAGE_DECLARATION(dst),
6399 uint src0_stride_z,
6400 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006401#if defined(BETA)
6402 uint src2_stride_z,
6403#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006404 uint dst_stride_z
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006405#if defined(REINTERPRET_INPUT_AS_3D)
6406 ,
6407 uint src_cross_plane_pad
6408#endif // REINTERPRET_INPUT_AS_3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006409#if defined(REINTERPRET_OUTPUT_AS_3D)
6410 ,
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006411 uint dst_cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006412#endif // REINTERPRET_OUTPUT_AS_3D
6413 )
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006414{
6415 // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6416 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
6417
6418 // Compute starting address for matrix A and Matrix B
6419 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
6420
6421 // Update address for the matrix A
6422 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
6423
6424 // Update address for the matrix B
6425 src_addr.s1 += idx * sizeof(float);
6426
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006427#if defined(REINTERPRET_INPUT_AS_3D)
6428 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
6429 // in order to take into account the presence of possible cross plane paddings
6430 //
6431 // | |
6432 // | plane0 |
6433 // | |
6434 // |__________________|
6435 // |******************|
6436 // | cross_plane_pad |
6437 // |******************|
6438 // | |
6439 // | plane1 |
6440 // | |
6441 // |__________________|
6442
6443 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
6444 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6445 zin = min(DEPTH_GEMM3D - 1, zin);
6446
6447 // Add offset due to the cross plane paddings
6448 zin *= (src_cross_plane_pad * src0_stride_y);
6449
6450 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6451 // multiply src0_stride_z by DEPTH_GEMM3D
6452 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
6453
6454#else // defined(REINTERPRET_INPUT_AS_3D)
6455
Gian Marcoae2af742018-02-15 12:35:44 +00006456 // Add offset for batched GEMM
6457 src_addr.s0 += get_global_id(2) * src0_stride_z;
6458
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006459#endif // defined(REINTERPRET_INPUT_AS_3D)
6460
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006461#if defined(MATRIX_B_DEPTH)
6462 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
6463 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6464#else // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006465 src_addr.s1 += get_global_id(2) * src1_stride_z;
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006466#endif // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006467
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006468 // Initialize accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006469 float2 acc0 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006470#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006471 float2 acc1 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006472#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6473#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006474 float2 acc2 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006475#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6476#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006477 float2 acc3 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006478#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6479
6480 // A and B src indices get incremented at the same time.
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006481 int i = 0;
6482 for(; i <= ((int)COLS_A - 8); i += 8)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006483 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006484#if defined(REINTERPRET_INPUT_AS_3D)
6485 // Load values from matrix A
6486 float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
6487#else // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006488 // Load values from matrix A
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006489 float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006490#endif // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006491
6492 // Load values from matrix B
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006493 float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6494 src_addr.s1 += src1_stride_y;
6495 float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6496 src_addr.s1 += src1_stride_y;
6497 float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6498 src_addr.s1 += src1_stride_y;
6499 float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6500 src_addr.s1 += src1_stride_y;
6501 float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6502 src_addr.s1 += src1_stride_y;
6503 float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6504 src_addr.s1 += src1_stride_y;
6505 float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6506 src_addr.s1 += src1_stride_y;
6507 float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6508 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006509
6510 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006511 acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
6512 acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
6513 acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
6514 acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
6515 acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
6516 acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
6517 acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
6518 acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006519
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006520 acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
6521 acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
6522 acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
6523 acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
6524 acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
6525 acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
6526 acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
6527 acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006528
6529#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006530#if defined(REINTERPRET_INPUT_AS_3D)
6531 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6532#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006533 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006534#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006535 acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
6536 acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
6537 acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
6538 acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
6539 acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
6540 acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
6541 acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
6542 acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006543
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006544 acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
6545 acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
6546 acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
6547 acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
6548 acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
6549 acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
6550 acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
6551 acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006552#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6553#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006554#if defined(REINTERPRET_INPUT_AS_3D)
6555 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6556#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006557 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006558#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006559 acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
6560 acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
6561 acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
6562 acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
6563 acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
6564 acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
6565 acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
6566 acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006567
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006568 acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
6569 acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
6570 acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
6571 acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
6572 acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
6573 acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
6574 acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
6575 acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006576#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6577#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006578#if defined(REINTERPRET_INPUT_AS_3D)
6579 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6580#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006581 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006582#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006583 acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
6584 acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
6585 acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
6586 acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
6587 acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
6588 acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
6589 acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
6590 acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006591
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006592 acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
6593 acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
6594 acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
6595 acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
6596 acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
6597 acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
6598 acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
6599 acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006600#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006601
6602 src_addr.s0 += sizeof(float) * 8;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006603 }
6604 // float size increment
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006605 for(; i < (int)COLS_A; ++i)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006606 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006607#if defined(REINTERPRET_INPUT_AS_3D)
6608 // Load values from matrix A
6609 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
6610#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6611 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6612#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6613#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6614 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6615#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6616#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6617 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6618#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6619#else // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006620 // Load values from matrix A
6621 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
6622#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6623 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6624#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6625#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6626 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6627#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6628#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6629 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6630#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006631#endif // defined(REINTERPRET_INPUT_AS_3D)
6632
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006633 // Load values from matrix B
6634 float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006635 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006636
6637 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006638 acc0.s0 = fma(a0, b0.s0, acc0.s0);
6639 acc0.s1 = fma(a0, b0.s1, acc0.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006640#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006641 acc1.s0 = fma(a1, b0.s0, acc1.s0);
6642 acc1.s1 = fma(a1, b0.s1, acc1.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006643#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6644#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006645 acc2.s0 = fma(a2, b0.s0, acc2.s0);
6646 acc2.s1 = fma(a2, b0.s1, acc2.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006647#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6648#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006649 acc3.s0 = fma(a3, b0.s0, acc3.s0);
6650 acc3.s1 = fma(a3, b0.s1, acc3.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006651#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006652
6653 src_addr.s0 += sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006654 }
6655
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006656 int z = get_global_id(2);
6657
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006658 // Compute destination address
6659 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
6660
Gian Marcoae2af742018-02-15 12:35:44 +00006661 // Compute dst address
6662 __global uchar *dst_addr = offset(&dst, 0, 0);
6663
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006664 uint4 zout = 0;
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00006665
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006666#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006667
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006668 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006669 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006670 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006671 // | |
6672 // | plane0 |
6673 // | |
6674 // |__________________|
6675 // |******************|
6676 // | cross_plane_pad |
6677 // |******************|
6678 // | |
6679 // | plane1 |
6680 // | |
6681 // |__________________|
Gian Marcoae2af742018-02-15 12:35:44 +00006682
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006683 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006684 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6685 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006686
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006687 // Add offset due to the cross plane paddings
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006688 zout *= (dst_cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006689
6690 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6691 // multiply dst_stride_z by DEPTH_GEMM3D
6692 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006693#else // defined(REINTERPRET_OUTPUT_AS_3D)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006694 // Add offset for batched GEMM
6695 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006696#endif // defined(REINTERPRET_OUTPUT_AS_3D)
6697
6698 // Multiply by the weight of matrix-matrix product and store the result
6699#if defined(ALPHA)
6700 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
6701#endif // defined(ALPHA)
6702
6703 // Add beta*bias
6704#if defined(BETA)
6705 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
6706
6707#if defined(BROADCAST_BIAS)
6708 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
6709
6710 LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
6711
6712#ifndef UNIT_BETA
6713 SCALE_BLOCK(1, float, bias, BETA);
6714#endif // UNIT_BIAS
6715
6716 // acc = acc + bias[broadcasted]
6717 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
6718
6719#else // defined(BROADCAST_BIAS)
6720 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *
6721 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
6722
6723 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
6724
6725#ifndef UNIT_BETA
6726 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
6727#endif // UNIT_BIAS
6728
6729 // acc = acc + bias
6730 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
6731
6732#endif // defined(BROADCAST_BIAS)
6733#endif // defined(BETA)
6734
6735#if defined(ACTIVATION_TYPE)
6736 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
6737#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006738
6739 // Store the output block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006740 vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006741#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006742 vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006743#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6744#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006745 vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006746#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6747#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006748 vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006749#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006750}
6751
Vidhya Sudhan Loganathanbdff4912018-05-22 15:03:09 +01006752#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01006753/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
6754 *
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006755 * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
6756 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
6757 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8.
6758 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
6759 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006760 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
6761 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006762 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006763 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
6764 * The activation function is performed after the bias addition
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006765 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
6766 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
6767 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
6768 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
6769 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
6770 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped
6771 *
6772 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
6773 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
6774 * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)
6775 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
6776 * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)
6777 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
6778 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
6779 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
6780 * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
6781 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
6782 * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
6783 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006784 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
6785 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
6786 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
6787 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
6788 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
6789 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006790 * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
6791 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
6792 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
6793 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
6794 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
6795 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
6796 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
6797 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006798 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006799 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
6800 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
6801 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
6802 */
6803__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
6804 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006805#if defined(BETA)
6806 IMAGE_DECLARATION(src2),
6807#endif // defined(BETA)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006808 IMAGE_DECLARATION(dst),
6809 uint src0_stride_z,
6810 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006811#if defined(BETA)
6812 uint src2_stride_z,
6813#endif //defined(BETA)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006814 uint dst_stride_z
6815#if defined(REINTERPRET_INPUT_AS_3D)
6816 ,
6817 uint src_cross_plane_pad
6818#endif // REINTERPRET_INPUT_AS_3D
6819#if defined(REINTERPRET_OUTPUT_AS_3D)
6820 ,
6821 uint dst_cross_plane_pad
6822#endif // REINTERPRET_OUTPUT_AS_3D
6823 )
6824{
6825 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
6826
6827 // Compute starting address for matrix A and Matrix B
6828 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
6829
6830 // Update address for the matrix A
6831 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
6832
6833 // Update address for the matrix B
6834 src_addr.s1 += idx * sizeof(half);
6835
6836#if defined(REINTERPRET_INPUT_AS_3D)
6837 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
6838 // in order to take into account the presence of possible cross plane paddings
6839 //
6840 // | |
6841 // | plane0 |
6842 // | |
6843 // |__________________|
6844 // |******************|
6845 // | cross_plane_pad |
6846 // |******************|
6847 // | |
6848 // | plane1 |
6849 // | |
6850 // |__________________|
6851
6852 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
6853 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6854 zin = min(DEPTH_GEMM3D - 1, zin);
6855
6856 // Add offset due to the cross plane paddings
6857 zin *= (src_cross_plane_pad * src0_stride_y);
6858
6859 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6860 // multiply src0_stride_z by DEPTH_GEMM3D
6861 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
6862
6863#else // defined(REINTERPRET_INPUT_AS_3D)
6864
6865 // Add offset for batched GEMM
6866 src_addr.s0 += get_global_id(2) * src0_stride_z;
6867
6868#endif // defined(REINTERPRET_INPUT_AS_3D)
6869
6870#if defined(MATRIX_B_DEPTH)
6871 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
6872 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6873#else // defined(MATRIX_B_DEPTH)
6874 src_addr.s1 += get_global_id(2) * src1_stride_z;
6875#endif // defined(MATRIX_B_DEPTH)
6876
6877 float8 acc0 = 0.0h;
6878#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6879 float8 acc1 = 0.0h;
6880#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6881#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6882 float8 acc2 = 0.0h;
6883#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6884#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6885 float8 acc3 = 0.0h;
6886#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6887
6888 int i = 0;
6889 for(; i <= ((int)COLS_A - 4); i += 4)
6890 {
6891#if defined(REINTERPRET_INPUT_AS_3D)
6892 // Load values from matrix A
Usama Arif0681e3b2019-04-25 14:28:07 +01006893 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
6894#else // defined(REINTERPRET_INPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006895 // Load values from matrix A
6896 half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
6897#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6898 half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6899#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6900#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6901 half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6902#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6903#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6904 half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6905#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6906#endif // defined(REINTERPRET_INPUT_AS_3D)
6907
6908 // Load values from matrix B
6909 float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6910 src_addr.s1 += src1_stride_y;
6911
6912 // Accumulate
6913 acc0 = fma(b0, (float8)a0.s0, acc0);
6914#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6915 acc1 = fma(b0, (float8)a1.s0, acc1);
6916#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6917#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6918 acc2 = fma(b0, (float8)a2.s0, acc2);
6919#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6920#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6921 acc3 = fma(b0, (float8)a3.s0, acc3);
6922#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6923
6924 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6925 src_addr.s1 += src1_stride_y;
6926 acc0 = fma(b0, (float8)a0.s1, acc0);
6927#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6928 acc1 = fma(b0, (float8)a1.s1, acc1);
6929#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6930#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6931 acc2 = fma(b0, (float8)a2.s1, acc2);
6932#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6933#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6934 acc3 = fma(b0, (float8)a3.s1, acc3);
6935#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6936
6937 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6938 src_addr.s1 += src1_stride_y;
6939 acc0 = fma(b0, (float8)a0.s2, acc0);
6940#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6941 acc1 = fma(b0, (float8)a1.s2, acc1);
6942#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6943#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6944 acc2 = fma(b0, (float8)a2.s2, acc2);
6945#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6946#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6947 acc3 = fma(b0, (float8)a3.s2, acc3);
6948#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6949
6950 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6951 src_addr.s1 += src1_stride_y;
6952 acc0 = fma(b0, (float8)a0.s3, acc0);
6953#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6954 acc1 = fma(b0, (float8)a1.s3, acc1);
6955#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6956#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6957 acc2 = fma(b0, (float8)a2.s3, acc2);
6958#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6959#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6960 acc3 = fma(b0, (float8)a3.s3, acc3);
6961#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6962
6963 src_addr.s0 += 4 * sizeof(half);
6964 }
6965
6966 for(; i < (int)COLS_A; ++i)
6967 {
6968#if defined(REINTERPRET_INPUT_AS_3D)
6969 // Load values from matrix A
6970 half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
6971#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6972 half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6973#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6974#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6975 half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6976#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6977#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6978 half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6979#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6980#else // defined(REINTERPRET_INPUT_AS_3D)
6981 // Load values from matrix A
6982 half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
6983#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6984 half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6985#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6986#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6987 half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6988#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6989#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6990 half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6991#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6992#endif // defined(REINTERPRET_INPUT_AS_3D)
6993
6994 // Load values from matrix B
6995 float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6996
6997 src_addr += (int2)(sizeof(half), src1_stride_y);
6998
6999 // Accumulate
7000 acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;
7001#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7002 acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;
7003#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7004#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7005 acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;
7006#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7007#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7008 acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;
7009#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7010 }
7011
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007012 int z = get_global_id(2);
7013
7014 // Compute destination address
7015 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
7016
7017 // Compute dst address
7018 __global uchar *dst_addr = offset(&dst, 0, 0);
7019
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007020 uint4 zout = 0;
7021
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007022#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007023
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007024 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
7025 // in order to take into account the presence of possible cross plane paddings
7026 //
7027 // | |
7028 // | plane0 |
7029 // | |
7030 // |__________________|
7031 // |******************|
7032 // | cross_plane_pad |
7033 // |******************|
7034 // | |
7035 // | plane1 |
7036 // | |
7037 // |__________________|
7038
7039 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007040 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
7041 zout = min(DEPTH_GEMM3D - 1, zout);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007042
7043 // Add offset due to the cross plane paddings
7044 zout *= (dst_cross_plane_pad * dst_stride_y);
7045
7046 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
7047 // multiply dst_stride_z by DEPTH_GEMM3D
7048 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007049#else // defined(REINTERPRET_OUTPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007050 // Add offset for batched GEMM
7051 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007052#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007053
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007054 // Multiply by the weight of matrix-matrix product and store the result
7055#if defined(ALPHA)
7056 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
7057#endif // defined(ALPHA)
7058
7059#if defined(BETA)
7060 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
7061
7062#if defined(BROADCAST_BIAS)
7063 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
7064
7065 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
7066
7067 float8 bias_f0 = convert_float8(bias0);
7068
7069#ifndef UNIT_BETA
7070 SCALE_BLOCK(1, float, bias_f, BETA);
7071#endif // UNIT_BIAS
7072
7073 // acc = acc + bias[broadcasted]
7074 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);
7075
7076#else // defined(BROADCAST_BIAS)
7077 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
7078 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
7079
7080 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
7081
7082 float8 bias_f0 = convert_float8(bias0);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007083#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007084 float8 bias_f1 = convert_float8(bias1);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007085#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7086#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007087 float8 bias_f2 = convert_float8(bias2);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007088#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7089#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007090 float8 bias_f3 = convert_float8(bias3);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007091#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007092
7093#ifndef UNIT_BETA
7094 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);
7095#endif // UNIT_BIAS
7096
7097 // acc = acc + bias
7098 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);
7099
7100#endif // defined(BROADCAST_BIAS)
7101#endif // defined(BETA)
7102
7103 half8 acc_h0 = convert_half8(acc0);
7104#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7105 half8 acc_h1 = convert_half8(acc1);
7106#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7107#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7108 half8 acc_h2 = convert_half8(acc2);
7109#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7110#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7111 half8 acc_h3 = convert_half8(acc3);
7112#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7113
7114#if defined(ACTIVATION_TYPE)
7115 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);
7116#endif // defined(ACTIVATION_TYPE)
7117
7118 // Store the output block
7119 STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007120}
7121
7122/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
7123 *
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007124 * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
7125 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
7126 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
7127 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
7128 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007129 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
7130 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007131 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007132 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
7133 * The activation function is performed after the bias addition
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007134 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
7135 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007136 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
7137 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
7138 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
7139 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
7140 *
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007141 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
7142 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
7143 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
7144 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
7145 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
7146 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
7147 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
7148 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
7149 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
7150 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
7151 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
7152 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007153 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
7154 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
7155 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
7156 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
7157 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
7158 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007159 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
7160 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
7161 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
7162 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
7163 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
7164 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007165 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
7166 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007167 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007168 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007169 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
7170 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007171 */
7172__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
7173 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007174#if defined(BETA)
7175 IMAGE_DECLARATION(src2),
7176#endif // defined(BETA)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007177 IMAGE_DECLARATION(dst),
7178 uint src0_stride_z,
7179 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007180#if defined(BETA)
7181 uint src2_stride_z,
7182#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007183 uint dst_stride_z
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007184#if defined(REINTERPRET_INPUT_AS_3D)
7185 ,
7186 uint src_cross_plane_pad
7187#endif // REINTERPRET_INPUT_AS_3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007188#if defined(REINTERPRET_OUTPUT_AS_3D)
7189 ,
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007190 uint dst_cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007191#endif // REINTERPRET_OUTPUT_AS_3D
7192 )
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007193{
7194 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
7195
7196 // Compute starting address for matrix A and Matrix B
7197 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
7198
7199 // Update address for the matrix A
7200 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
7201
7202 // Update address for the matrix B
7203 src_addr.s1 += idx * sizeof(half);
7204
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007205#if defined(REINTERPRET_INPUT_AS_3D)
7206 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
7207 // in order to take into account the presence of possible cross plane paddings
7208 //
7209 // | |
7210 // | plane0 |
7211 // | |
7212 // |__________________|
7213 // |******************|
7214 // | cross_plane_pad |
7215 // |******************|
7216 // | |
7217 // | plane1 |
7218 // | |
7219 // |__________________|
7220
7221 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
7222 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
7223 zin = min(DEPTH_GEMM3D - 1, zin);
7224
7225 // Add offset due to the cross plane paddings
7226 zin *= (src_cross_plane_pad * src0_stride_y);
7227
7228 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
7229 // multiply src0_stride_z by DEPTH_GEMM3D
7230 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
7231
7232#else // defined(REINTERPRET_INPUT_AS_3D)
7233
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007234 // Add offset for batched GEMM
7235 src_addr.s0 += get_global_id(2) * src0_stride_z;
7236
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007237#endif // defined(REINTERPRET_INPUT_AS_3D)
7238
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007239#if defined(MATRIX_B_DEPTH)
7240 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
7241 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
7242#else // defined(MATRIX_B_DEPTH)
7243 src_addr.s1 += get_global_id(2) * src1_stride_z;
7244#endif // defined(MATRIX_B_DEPTH)
7245
7246 half8 acc0 = 0.0h;
7247#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7248 half8 acc1 = 0.0h;
7249#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7250#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7251 half8 acc2 = 0.0h;
7252#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7253#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7254 half8 acc3 = 0.0h;
7255#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7256
7257 int i = 0;
7258 for(; i <= ((int)COLS_A - 4); i += 4)
7259 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007260#if defined(REINTERPRET_INPUT_AS_3D)
7261 // Load values from matrix A
Usama Arif0681e3b2019-04-25 14:28:07 +01007262 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
7263#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007264 // Load values from matrix A
7265 half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
7266#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7267 half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
7268#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7269#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7270 half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
7271#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7272#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7273 half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
7274#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007275#endif // defined(REINTERPRET_INPUT_AS_3D)
7276
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007277 // Load values from matrix B
7278 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
7279 src_addr.s1 += src1_stride_y;
7280
7281 // Accumulate
7282 acc0 = fma(b0, (half8)a0.s0, acc0);
7283#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7284 acc1 = fma(b0, (half8)a1.s0, acc1);
7285#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7286#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7287 acc2 = fma(b0, (half8)a2.s0, acc2);
7288#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7289#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7290 acc3 = fma(b0, (half8)a3.s0, acc3);
7291#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7292
7293 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
7294 src_addr.s1 += src1_stride_y;
7295 acc0 = fma(b0, (half8)a0.s1, acc0);
7296#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7297 acc1 = fma(b0, (half8)a1.s1, acc1);
7298#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7299#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7300 acc2 = fma(b0, (half8)a2.s1, acc2);
7301#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7302#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7303 acc3 = fma(b0, (half8)a3.s1, acc3);
7304#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7305
7306 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
7307 src_addr.s1 += src1_stride_y;
7308 acc0 = fma(b0, (half8)a0.s2, acc0);
7309#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7310 acc1 = fma(b0, (half8)a1.s2, acc1);
7311#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7312#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7313 acc2 = fma(b0, (half8)a2.s2, acc2);
7314#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7315#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7316 acc3 = fma(b0, (half8)a3.s2, acc3);
7317#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7318
7319 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
7320 src_addr.s1 += src1_stride_y;
7321 acc0 = fma(b0, (half8)a0.s3, acc0);
7322#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7323 acc1 = fma(b0, (half8)a1.s3, acc1);
7324#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7325#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7326 acc2 = fma(b0, (half8)a2.s3, acc2);
7327#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7328#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7329 acc3 = fma(b0, (half8)a3.s3, acc3);
7330#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7331
7332 src_addr.s0 += 4 * sizeof(half);
7333 }
7334
7335 for(; i < (int)COLS_A; ++i)
7336 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007337#if defined(REINTERPRET_INPUT_AS_3D)
7338 // Load values from matrix A
7339 half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
7340#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7341 half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
7342#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7343#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7344 half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
7345#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7346#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7347 half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
7348#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7349#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007350 // Load values from matrix A
7351 half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
7352#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7353 half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
7354#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7355#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7356 half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
7357#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7358#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7359 half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
7360#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007361#endif // defined(REINTERPRET_INPUT_AS_3D)
7362
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007363 // Load values from matrix B
7364 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
7365
7366 src_addr += (int2)(sizeof(half), src1_stride_y);
7367
7368 // Accumulate
7369 acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
7370#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7371 acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
7372#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7373#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7374 acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
7375#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7376#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7377 acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
7378#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7379 }
7380
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007381 int z = get_global_id(2);
7382
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007383 // Compute destination address
7384 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
7385
7386 // Compute dst address
7387 __global uchar *dst_addr = offset(&dst, 0, 0);
7388
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007389 uint4 zout = 0;
7390
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007391#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007392
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007393 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01007394 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007395 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01007396 // | |
7397 // | plane0 |
7398 // | |
7399 // |__________________|
7400 // |******************|
7401 // | cross_plane_pad |
7402 // |******************|
7403 // | |
7404 // | plane1 |
7405 // | |
7406 // |__________________|
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007407
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007408 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007409 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
7410 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007411
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01007412 // Add offset due to the cross plane paddings
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01007413 zout *= (dst_cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007414
7415 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
7416 // multiply dst_stride_z by DEPTH_GEMM3D
7417 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007418#else // defined(REINTERPRET_OUTPUT_AS_3D)
7419 // Add offset for batched GEMM
7420 dst_addr += z * dst_stride_z;
7421#endif // defined(REINTERPRET_OUTPUT_AS_3D)
7422
7423 // Multiply by the weight of matrix-matrix product and store the result
7424#if defined(ALPHA)
7425 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
7426#endif // defined(ALPHA)
7427
7428 // Add beta*bias
7429#if defined(BETA)
7430 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
7431
7432#if defined(BROADCAST_BIAS)
7433 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
7434
7435 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
7436
7437#ifndef UNIT_BETA
7438 SCALE_BLOCK(1, half, bias, BETA);
7439#endif // UNIT_BIAS
7440
7441 // acc = acc + bias[broadcasted]
7442 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
7443
7444#else // defined(BROADCAST_BIAS)
7445 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
7446 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
7447
7448 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
7449
7450#ifndef UNIT_BETA
7451 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
7452#endif // UNIT_BIAS
7453
7454 // acc = acc + bias
7455 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
7456
7457#endif // defined(BROADCAST_BIAS)
7458#endif // defined(BETA)
7459
7460#if defined(ACTIVATION_TYPE)
7461 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);
7462#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007463
7464 // Store the output block
Usama Arif0681e3b2019-04-25 14:28:07 +01007465 STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007466}
Vidhya Sudhan Loganathanbdff4912018-05-22 15:03:09 +01007467#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007468
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01007469#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007470
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007471#if defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007472/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
7473 *
Gian Marco19835e52018-01-30 13:35:54 +00007474 * @note The beta's value need to be passed at compile time using -DBETA
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007475 *
7476 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
7477 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
7478 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
7479 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
7480 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007481 * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in bytes)
7482 * @param[in] src_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007483 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01007484 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007485 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
7486 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
7487 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
7488 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007489 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
7490 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007491 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
7492 */
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007493__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
7494 TENSOR3D_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007495{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007496 // Compute source and destination addresses
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007497 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
7498 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007499
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007500 // Load values from A x B
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007501 float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
7502
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007503 // Load values from Matrix C
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007504 float4 c = vload4(0, (__global float *)src.ptr);
7505
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007506 // Computes alpha * axb + beta * c
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007507 float4 out = alpha_ab + (float4)BETA * c;
7508
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007509 // Store final result in axb matrix
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007510 vstore4(out, 0, (__global float *)dst.ptr);
7511}
7512
Vidhya Sudhan Loganathan76c85642018-05-25 13:53:02 +01007513#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007514/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
7515 *
Gian Marco19835e52018-01-30 13:35:54 +00007516 * @note The beta's value need to be passed at compile time using -DBETA
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01007517 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007518 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
7519 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
7520 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
7521 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
7522 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007523 * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in bytes)
7524 * @param[in] src_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007525 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01007526 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007527 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
7528 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
7529 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
7530 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007531 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
7532 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007533 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
7534 */
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007535__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
7536 TENSOR3D_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007537{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007538 // Compute source and destination addresses
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007539 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
7540 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007541
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007542 // Load values from A x B
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007543 half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
7544
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007545 // Load values from Matrix C
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007546 half8 c = vload8(0, (__global half *)src.ptr);
7547
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007548 // Computes alpha * axb + beta * c
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007549 half8 out = alpha_ab + (half8)BETA * c;
7550
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007551 // Store final result in axb matrix
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007552 vstore8(out, 0, (__global half *)dst.ptr);
7553}
Vidhya Sudhan Loganathan76c85642018-05-25 13:53:02 +01007554#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007555#endif // defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007556
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007557#if defined(WIDTH_VECTOR_A)
/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1), as used by the locally connected layer
 *
 * @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
 *
 * @note The input A and matrix B must not be reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    // Each work-item accumulates a float4, i.e. 4 consecutive columns, for one row of A
    const int col = get_global_id(0) * 4;
    const int row = get_global_id(1);

    // Running byte offsets: .s0 walks along row `row` of A, .s1 walks down the rows of
    // Z-slice `row` of B, starting at column `col`
    int2 offs = (int2)(src0_offset_first_element_in_bytes + src0_stride_y * row,
                       src1_offset_first_element_in_bytes + src1_stride_z * row);
    offs.s1 += col * sizeof(float);

    // Byte offset one past the last element of the current row of A
    const int a_end = offs.s0 + (WIDTH_VECTOR_A * sizeof(float));

    float4 sum = 0.0f;

    // Main loop: consume two elements of A and two rows of B per iteration
    const int2 step_pair = (int2)(2 * sizeof(float), 2 * src1_stride_y);
    while(offs.s0 <= (a_end - 2 * (int)sizeof(float)))
    {
        const float2 a_pair = vload2(0, (__global float *)(src0_ptr + offs.s0));
        const float4 b_row0 = vload4(0, (__global float *)(src1_ptr + offs.s1));
        const float4 b_row1 = vload4(0, (__global float *)(src1_ptr + offs.s1 + src1_stride_y));

        sum += b_row0 * (float4)a_pair.s0;
        sum += b_row1 * (float4)a_pair.s1;

        offs += step_pair;
    }

    // Tail loop: handles the leftover element when WIDTH_VECTOR_A is odd
    const int2 step_single = (int2)(sizeof(float), src1_stride_y);
    while(offs.s0 < a_end)
    {
        const float  a_val = *((__global float *)(src0_ptr + offs.s0));
        const float4 b_row = vload4(0, (__global float *)(src1_ptr + offs.s1));

        sum += b_row * (float4)a_val;

        offs += step_single;
    }

    // Resolve the destination address for this work-item and write the 4 accumulated values
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore4(sum, 0, (__global float *)(offset(&dst_img, 0, 0)));
}
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007623#endif // defined(WIDTH_VECTOR_A)