blob: 653aa5591cc57d80c396be481b6905407b4a3d67 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Usama Arif0681e3b2019-04-25 14:28:07 +010024#include "gemm_helpers.h"
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +000025#include "repeat.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010027#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
// Lane-index vectors: INCn is an n-component uint vector literal holding 0, 1, ..., n-1.
// INC(K0) is routed through CONCAT_INC so that K0 is macro-expanded before it is pasted
// onto the INC prefix (for example -DK0=4 selects INC4). Only the widths 2, 3, 4, 8 and 16 exist.
// NOTE(review): VEC_DATA_TYPE is defined outside this chunk; presumably VEC_DATA_TYPE(uint, n) names the n-wide uint vector type - confirm.
Gian Marco Iodiceb87b95e2019-01-21 17:14:31 +000028#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
29#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
30#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
31#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
32#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
33#define CONCAT_INC(K0) INC##K0
34#define INC(K0) CONCAT_INC(K0)
35
// BOUNDARY_CONDITION_X(x, a): intended to zero the lanes of vector a that lie at or beyond SRC_WIDTH.
// Lane i of block x corresponds to column x * K0 + i (built with INC(K0)); the per-lane comparison
// against SRC_WIDTH is converted to a DATA_TYPE vector and passed to select() as the mask, with 0 as
// the fallback value and a as the value kept for in-range lanes.
// When SRC_WIDTH is an exact multiple of K0 the macro expands to an empty statement expression.
// NOTE(review): the mask is converted to DATA_TYPE; confirm this is a valid select() mask type for non-integer DATA_TYPE.
// No use of this macro is visible in this part of the file.
36#if(SRC_WIDTH % K0)
37#define BOUNDARY_CONDITION_X(x, a) \
38 ({ \
39 a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
40 })
41#else // (SRC_WIDTH % K0)
42#define BOUNDARY_CONDITION_X(x, a) \
43 ({})
44#endif // (SRC_WIDTH % K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000045
// LOAD_TENSOR_BOUNDARY_AWARE_M0XK0: loads one block through LOAD_TENSOR_M0XN0, shrinking the load at the tensor edges.
//  - rows:    PARTIAL_LOAD_M0 when (y * M0 + M0 >= SRC_HEIGHT) and PARTIAL_LOAD_M0 != 0, otherwise M0
//  - columns: PARTIAL_LOAD_K0 when (x * K0 + K0 >= SRC_WIDTH) and PARTIAL_LOAD_K0 != 0, otherwise K0
// x and y are NOT macro parameters: they are taken from the scope in which the macro is expanded,
// so the expansion site must declare variables with exactly those names.
// The remaining arguments (a, input_ptr, src_stride_y, zin) are forwarded unchanged to LOAD_TENSOR_M0XN0,
// which is defined outside this chunk.
// NOTE(review): presumably PARTIAL_LOAD_M0 / PARTIAL_LOAD_K0 are 0 when the dimension is an exact multiple of the block size - confirm against the host code.
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010046#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
47 ({ \
48 if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \
49 { \
50 if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
51 { \
52 LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
53 } \
54 else \
55 { \
56 LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
57 } \
58 } \
59 else \
60 { \
61 if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
62 { \
63 LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
64 } \
65 else \
66 { \
67 LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
68 } \
69 } \
70 })
71
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000072/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
73 * the output matrix unrolling the values.
74 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010075 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
76 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010077 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010078 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
79 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010080 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
81 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000082 * @note Only the following values for M0, K0 and V0 are supported:
83 * M0: 2,3,4,5,6,7,8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +000084 * K0: 2,3,4,8,16
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000085 * V0: greater than 0
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010086 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000087 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
88 * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
89 * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
90 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
91 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
92 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +010093 * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000094 * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
95 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
96 * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
97 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
98 * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
99 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
100 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
101 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
102 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
103 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
104 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
105 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
106 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
107 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
108 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
109 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
110 */
111__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
112 TENSOR3D_DECLARATION(dst)
113#if defined(REINTERPRET_INPUT_AS_3D)
114 ,
115 uint cross_plane_pad
116#endif // REINTERPRET_INPUT_AS_3D
117 )
118{
    // The three helper macros below are local to this kernel and are #undef-ed before the closing brace.
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000119 // Block size
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000120#define BLOCK_SIZE ((M0) * (K0))
121
    // Element offset applied per (y % V0) in the output address below:
    // one row of K0 elements when interleaving, a whole block otherwise.
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000122 // Output offset X
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000123#if defined(INTERLEAVE)
124#define OUTPUT_OFFSET_X (K0)
125#else // defined(INTERLEAVE)
126#define OUTPUT_OFFSET_X (BLOCK_SIZE)
127#endif // defined(INTERLEAVE)
128
    // Element step handed to STORE_BLOCK (scaled to bytes at the call); presumably the distance between consecutive stored rows.
    // NOTE(review): the interleaved expansion is not wrapped in outer parentheses; at the visible use it is only multiplied.
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000129 // Output step X
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000130#if defined(INTERLEAVE)
131#define OUTPUT_STEP_X (K0) * (V0)
132#else // Do not interleave
133#define OUTPUT_STEP_X (K0)
134#endif // defined(INTERLEAVE)
135
136 // Compute source and destination addresses
    // The names x and y are also read implicitly by LOAD_TENSOR_BOUNDARY_AWARE_M0XK0; do not rename them.
137 uint x = get_global_id(0);
138 uint y = get_global_id(1);
139 uint z = get_global_id(2);
140
141 // ------------------ Compute input/output addresses ---------------------------
142
143 // Compute the input address
    // Block (x, y) starts K0 elements per x to the right and M0 rows per y down from the first element.
144 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
145
146 // Compute the output address
    // Each x owns a span of V0 blocks (BLOCK_SIZE * V0 elements) in an output row;
    // y / V0 selects the output row and y % V0 the position inside that span.
147 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
148 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
149
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000150 // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
151 REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000152
153#if defined(REINTERPRET_INPUT_AS_3D)
154 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
155 // multiply src_stride_z by DEPTH_GEMM3D
156
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000157 input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
158
159 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +0100160 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000161
162#else // defined(REINTERPRET_INPUT_AS_3D)
163
164 input_ptr += z * (uint)src_stride_z;
165
166#endif // defined(REINTERPRET_INPUT_AS_3D)
167
168 // Add offset for batched GEMM
169 output_ptr += z * (uint)dst_stride_z;
170
171 // ---------------------------Load input values --------------------------------
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000172 // Load values from the LHS matrix
    // The a variables start at 0, so anything a reduced (partial) load does not overwrite stays 0.
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100173 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
174
175 LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
176
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000177 // ---------------------------Store output values ------------------------------
    // zout is created with 16 entries regardless of M0 and all are 0.
    // NOTE(review): presumably STORE_BLOCK adds zout<i> as a per-row offset - confirm in gemm_helpers.h.
Usama Arif0681e3b2019-04-25 14:28:07 +0100178 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
179 STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000180
181#undef BLOCK_SIZE
182#undef OUTPUT_OFFSET_X
183#undef OUTPUT_STEP_X
184}
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000185
// TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i): gathers component i of the row vectors
// a0..a(M0-1) into one M0-element column and stores it at
//     output_ptr + i * output_step_x * sizeof(DATA_TYPE)   (output_ptr is advanced in bytes).
// The row variables a0..a7 are NOT parameters: they are taken from the scope where the macro is expanded.
// i must be a single hexadecimal digit token (0-9, A-F): it is pasted both into the .s<i> component
// selector and into the 0x<i> integer literal used as the column index.
// For M0 = 5, 6 and 7 the column is written as a 4-wide store followed by a scalar, 2-wide or 3-wide
// store at element offset 4 (OpenCL C has no 5-, 6- or 7-component vector types).
// Any other M0 value stops compilation with an #error.
186#if M0 == 2
187#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
188 ({ \
189 VEC_DATA_TYPE(DATA_TYPE, M0) \
190 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
191 VSTORE(M0) \
192 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
193 })
194#elif M0 == 3 // M0 == 3
195#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
196 ({ \
197 VEC_DATA_TYPE(DATA_TYPE, M0) \
198 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
199 VSTORE(M0) \
200 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
201 })
202#elif M0 == 4 // M0 == 4
203#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
204 ({ \
205 VEC_DATA_TYPE(DATA_TYPE, M0) \
206 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
207 VSTORE(M0) \
208 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
209 })
210#elif M0 == 5 // M0 == 5
211#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
212 ({ \
213 VEC_DATA_TYPE(DATA_TYPE, 4) \
214 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
215 DATA_TYPE res1 = a4.s##i; \
216 VSTORE(4) \
217 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
218 *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
219 })
220#elif M0 == 6 // M0 == 6
221#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
222 ({ \
223 VEC_DATA_TYPE(DATA_TYPE, 4) \
224 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
225 VEC_DATA_TYPE(DATA_TYPE, 2) \
226 res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
227 VSTORE(4) \
228 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
229 VSTORE(2) \
230 (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
231 })
232#elif M0 == 7 // M0 == 7
233#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
234 ({ \
235 VEC_DATA_TYPE(DATA_TYPE, 4) \
236 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
237 VEC_DATA_TYPE(DATA_TYPE, 3) \
238 res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
239 VSTORE(4) \
240 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
241 VSTORE(3) \
242 (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
243 })
244#elif M0 == 8 // M0 == 8
245#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
246 ({ \
247 VEC_DATA_TYPE(DATA_TYPE, M0) \
248 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
249 VSTORE(M0) \
250 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
251 })
252#else // M0 not supported
253#error "M0 value not supported"
254#endif // M0 conditions
255
256/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
257 * the output matrix unrolling the values.
258 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100259 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
260 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100261 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100262 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
263 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100264 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
265 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000266 * @note Only the following values for M0, K0 and V0 are supported:
267 * M0: 2,3,4,5,6,7,8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000268 * K0: 2,3,4,8,16
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000269 * V0: greater than 0
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100270 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000271 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
272 * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
273 * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
274 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
275 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
276 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100277 * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000278 * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
279 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
280 * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
281 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
282 * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
283 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
284 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
285 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
286 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
287 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
288 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
289 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
290 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
291 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
292 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
293 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
294 */
295__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
296 TENSOR3D_DECLARATION(dst)
297#if defined(REINTERPRET_INPUT_AS_3D)
298 ,
299 uint cross_plane_pad
300#endif // REINTERPRET_INPUT_AS_3D
301 )
302{
    // The three helper macros below are local to this kernel and are #undef-ed before the closing brace.
303 // Block size
304#define BLOCK_SIZE ((M0) * (K0))
305
    // Element offset applied per (y % V0) in the output address below. The block is stored transposed,
    // so one stored row is a column of M0 elements: M0 when interleaving, a whole block otherwise.
306 // Output offset X
307#if defined(INTERLEAVE)
308#define OUTPUT_OFFSET_X (M0)
309#else // defined(INTERLEAVE)
310#define OUTPUT_OFFSET_X (BLOCK_SIZE)
311#endif // defined(INTERLEAVE)
312
    // Element distance between consecutive stored columns (multiplied by the column index inside TRANSPOSE_COLUMN_AND_STORE).
    // NOTE(review): the interleaved expansion is not wrapped in outer parentheses; at the visible uses it is only multiplied.
313 // Output step X
314#if defined(INTERLEAVE)
315#define OUTPUT_STEP_X (M0) * (V0)
316#else // Do not interleave
317#define OUTPUT_STEP_X (M0)
318#endif // defined(INTERLEAVE)
319
320 // Compute source and destination addresses
    // The names x and y are also read implicitly by LOAD_TENSOR_BOUNDARY_AWARE_M0XK0; do not rename them.
321 uint x = get_global_id(0);
322 uint y = get_global_id(1);
323 uint z = get_global_id(2);
324
325 // ------------------ Compute input/output addresses ---------------------------
326
327 // Compute the input address
    // Block (x, y) starts K0 elements per x to the right and M0 rows per y down from the first element.
328 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
329
330 // Compute the output address
    // Each x owns a span of V0 blocks (BLOCK_SIZE * V0 elements) in an output row;
    // y / V0 selects the output row and y % V0 the position inside that span.
331 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
332 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
333
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000334 // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
335 REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000336
337#if defined(REINTERPRET_INPUT_AS_3D)
338 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
339 // multiply src_stride_z by DEPTH_GEMM3D
340
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000341 input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
342
343 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +0100344 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000345
346#else // defined(REINTERPRET_INPUT_AS_3D)
347
348 input_ptr += z * (uint)src_stride_z;
349
350#endif // defined(REINTERPRET_INPUT_AS_3D)
351
352 // Add offset for batched GEMM
353 output_ptr += z * (uint)dst_stride_z;
354
355 // ---------------------------Load input values --------------------------------
    // The a variables start at 0, so anything a reduced (partial) load does not overwrite stays 0.
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100356 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000357
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100358 LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
359
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000360 // ---------------------------Transpose and store block -----------------------
    // One call per column of the block. The last argument is a hex digit token (0-9, A-F) because the
    // macro pastes it into both a .s<i> component selector and a 0x<i> literal.
    // The K0 guards emit exactly K0 calls for the supported widths 2, 3, 4, 8 and 16.
361
362 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
363 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
364#if K0 > 2
365 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000366#endif // K0 > 2
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000367#if K0 > 3
368 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
369#endif // K0 > 3
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000370#if K0 > 4
371 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
372 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
373 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
374 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
375#endif // K0 > 4
376#if K0 > 8
377 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
378 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
379 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
380 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
381 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
382 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
383 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
384 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
385#endif // K0 > 8
386
387#undef BLOCK_SIZE
388#undef OUTPUT_OFFSET_X
389#undef OUTPUT_STEP_X
390}
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100391#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000392
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000393#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
394/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
395 * the output matrix unrolling the values.
396 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100397 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
398 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
399 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
400 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000401 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
402 * @note Only the following values for K0, N0 and H0 are supported:
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000403 * N0: 2,3,4,8,16
404 * K0: 1,2,3,4,8,16
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000405 * H0: greater than 0
406 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100407 * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000408 * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
409 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
410 * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
411 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
412 * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
413 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
414 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
415 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
416 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
417 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
418 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
419 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
420 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
421 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
422 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
423 */
424__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
425 TENSOR3D_DECLARATION(dst))
426{
    // The three helper macros below are local to this kernel and are #undef-ed before the closing brace.
427 // Block size
428#define BLOCK_SIZE ((K0) * (N0))
429
    // Element offset applied per (x % H0) in the output address below:
    // one row of N0 elements when interleaving, a whole block otherwise.
430 // Output offset X
431#if defined(INTERLEAVE)
432#define OUTPUT_OFFSET_X (N0)
433#else // defined(INTERLEAVE)
434#define OUTPUT_OFFSET_X (BLOCK_SIZE)
435#endif // defined(INTERLEAVE)
436
    // Element step handed to STORE_BLOCK (scaled to bytes at the call); presumably the distance between consecutive stored rows.
    // NOTE(review): the interleaved expansion is not wrapped in outer parentheses; at the visible use it is only multiplied.
437 // Output step X
438#if defined(INTERLEAVE)
439#define OUTPUT_STEP_X (N0) * (H0)
440#else // Do not interleave
441#define OUTPUT_STEP_X (N0)
442#endif // defined(INTERLEAVE)
443
444 // Compute source and destination addresses
445 uint x = get_global_id(0);
446 uint y = get_global_id(1);
447 uint z = get_global_id(2);
448
449 // ------------------ Compute input/output addresses ---------------------------
450
451 // Compute the input address
    // Block (x, y) starts N0 elements per x to the right and K0 rows per y down; z selects the batch.
452 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
453
454 // Compute the output address
    // Each y owns a span of H0 blocks (BLOCK_SIZE * H0 elements) in an output row;
    // x / H0 selects the output row and x % H0 the position inside that span.
455 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
456 x / (uint)H0)
457 * (uint)dst_stride_y)
458 + z * (uint)dst_stride_z;
459
460 // ---------------------------Load input values --------------------------------
461
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +0000462 REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); // Create one N0-wide vector per block row, a0..a(K0-1), each initialised to 0 (rows 10..15 are named aA..aF)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000463
464 // Load values from the RHS matrix
    // Row 0 is loaded unconditionally. Every further row r is loaded only if y * K0 + r < SRC_HEIGHT,
    // so rows past the bottom edge keep their initial value 0.
    // NOTE(review): no guard in the X direction is visible; each load reads a full N0-wide vector - confirm the host guarantees this is in bounds.
465 a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
466#if K0 > 1
467 if(y * (uint)K0 + 1 < SRC_HEIGHT)
468 {
469 a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
470 }
471#endif // K0 > 1
472#if K0 > 2
473 if(y * (uint)K0 + 2 < SRC_HEIGHT)
474 {
475 a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
476 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000477#endif // K0 > 2
478#if K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000479 if(y * (uint)K0 + 3 < SRC_HEIGHT)
480 {
481 a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
482 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000483#endif // K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000484#if K0 > 4
485 if(y * (uint)K0 + 4 < SRC_HEIGHT)
486 {
487 a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
488 }
489 if(y * (uint)K0 + 5 < SRC_HEIGHT)
490 {
491 a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
492 }
493 if(y * (uint)K0 + 6 < SRC_HEIGHT)
494 {
495 a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
496 }
497 if(y * (uint)K0 + 7 < SRC_HEIGHT)
498 {
499 a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
500 }
501#endif // K0 > 4
502#if K0 > 8
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000503 if(y * (uint)K0 + 8 < SRC_HEIGHT)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000504 {
505 a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
506 }
507 if(y * (uint)K0 + 9 < SRC_HEIGHT)
508 {
509 a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
510 }
511 if(y * (uint)K0 + 10 < SRC_HEIGHT)
512 {
513 aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
514 }
515 if(y * (uint)K0 + 11 < SRC_HEIGHT)
516 {
517 aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
518 }
519 if(y * (uint)K0 + 12 < SRC_HEIGHT)
520 {
521 aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
522 }
523 if(y * (uint)K0 + 13 < SRC_HEIGHT)
524 {
525 aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
526 }
527 if(y * (uint)K0 + 14 < SRC_HEIGHT)
528 {
529 aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
530 }
531 if(y * (uint)K0 + 15 < SRC_HEIGHT)
532 {
533 aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
534 }
535#endif // K0 > 8
536
537 // ---------------------------Store output values ------------------------------
    // zout is created with 16 entries regardless of K0 and all are 0.
    // NOTE(review): presumably STORE_BLOCK adds zout<i> as a per-row offset - confirm in gemm_helpers.h.
Usama Arif0681e3b2019-04-25 14:28:07 +0100538 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
539 STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000540
541#undef BLOCK_SIZE
542#undef OUTPUT_OFFSET_X
543#undef OUTPUT_STEP_X
544}
545
546#if defined(TRANSPOSE)
547/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
548 * the output matrix unrolling the values.
549 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100550 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
551 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
552 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
553 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000554 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
555 * @note The option -DTRANSPOSE must be passed at compile time.
556 * @note Only the following values for K0, N0 and H0 are supported:
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000557 * N0: 2,3,4,8,16
558 * K0: 2,3,4,8,16
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000559 * H0: greater than 0
560 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100561 * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000562 * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
563 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
564 * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
565 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
566 * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
567 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
568 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
569 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
570 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
571 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
572 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
573 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
574 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
575 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
576 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
577 */
578__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
579 TENSOR3D_DECLARATION(dst))
580{
581 // Block size
582#define BLOCK_SIZE ((K0) * (N0))
583
584 // Output offset X
585#if defined(INTERLEAVE)
586#define OUTPUT_OFFSET_X (K0)
587#else // defined(INTERLEAVE)
588#define OUTPUT_OFFSET_X (BLOCK_SIZE)
589#endif // defined(INTERLEAVE)
590
591 // Output step X
592#if defined(INTERLEAVE)
593#define OUTPUT_STEP_X (K0) * (H0)
594#else // Do not interleave
595#define OUTPUT_STEP_X (K0)
596#endif // defined(INTERLEAVE)
597
598 // Compute source and destination addresses
599 uint x = get_global_id(0);
600 uint y = get_global_id(1);
601 uint z = get_global_id(2);
602
603 // ------------------ Compute input/output addresses ---------------------------
604
605 // Compute the input address
606 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
607
608 // Compute the output address
609 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
610 (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
611
612 // ---------------------------Load input values --------------------------------
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +0000613 REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000614
615 // Load values from the RHS matrix
616 a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
617 if(y * (uint)K0 + 1 < SRC_HEIGHT)
618 {
619 a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
620 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000621#if K0 > 2
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000622 if(y * (uint)K0 + 2 < SRC_HEIGHT)
623 {
624 a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
625 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000626#endif // K0 > 2
627#if K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000628 if(y * (uint)K0 + 3 < SRC_HEIGHT)
629 {
630 a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
631 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000632#endif // K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000633#if K0 > 4
634 if(y * (uint)K0 + 4 < SRC_HEIGHT)
635 {
636 a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
637 }
638 if(y * (uint)K0 + 5 < SRC_HEIGHT)
639 {
640 a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
641 }
642 if(y * (uint)K0 + 6 < SRC_HEIGHT)
643 {
644 a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
645 }
646 if(y * (uint)K0 + 7 < SRC_HEIGHT)
647 {
648 a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
649 }
650#endif // K0 > 4
651#if K0 > 8
Gian Marco Iodice89124342018-12-19 14:17:22 +0000652 if(y * (uint)K0 + 8 < SRC_HEIGHT)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000653 {
654 a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
655 }
656 if(y * (uint)K0 + 9 < SRC_HEIGHT)
657 {
658 a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
659 }
660 if(y * (uint)K0 + 10 < SRC_HEIGHT)
661 {
662 aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
663 }
664 if(y * (uint)K0 + 11 < SRC_HEIGHT)
665 {
666 aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
667 }
668 if(y * (uint)K0 + 12 < SRC_HEIGHT)
669 {
670 aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
671 }
672 if(y * (uint)K0 + 13 < SRC_HEIGHT)
673 {
674 aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
675 }
676 if(y * (uint)K0 + 14 < SRC_HEIGHT)
677 {
678 aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
679 }
680 if(y * (uint)K0 + 15 < SRC_HEIGHT)
681 {
682 aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
683 }
684#endif // K0 > 8
685
686 // ---------------------------Transpose the block ------------------------------
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +0000687 REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000688
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000689#if K0 == 2
690 // This part computes the following transpositions:
691 // 2x2 -> 2x2
692 // 2x4 -> 4x2
693 // 2x8 -> 8x2
694 // 2x16 -> 16x2
695 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
696 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
697#if N0 > 2
698 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
699#endif // N0 > 2
700#if N0 > 3
701 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
702#endif // N0 > 3
703#if N0 > 4
704 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
705 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
706 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
707 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
708#endif // N0 > 4
709#if N0 > 8
710 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
711 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
712 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
713 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
714 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
715 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
716 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
717 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
718#endif // N0 > 8
719
720#elif K0 == 3 // K0 == 2
721 // This part computes the following transpositions:
722 // 3x2 -> 2x3
723 // 3x4 -> 4x3
724 // 3x8 -> 8x3
725 // 3x16 -> 16x3
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100726 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
727 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000728#if N0 > 2
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100729 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000730#endif // N0 > 2
731#if N0 > 3
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100732 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000733#endif // N0 > 3
734#if N0 > 4
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100735 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
736 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
737 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
738 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000739#endif // N0 > 4
740#if N0 > 8
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100741 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
742 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
743 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
744 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
745 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
746 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
747 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
748 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000749#endif // N0 > 8
750
751#elif K0 == 4 // K0 == 4
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000752 // This part computes the following transpositions:
753 // 4x2 -> 2x4
754 // 4x4 -> 4x4
755 // 4x8 -> 8x4
756 // 4x16 -> 16x4
757 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
758 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
759#if N0 > 2
760 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000761#endif // N0 > 2
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000762#if N0 > 3
763 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
764#endif // N0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000765#if N0 > 4
766 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
767 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
768 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
769 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
770#endif // N0 > 4
771#if N0 > 8
772 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
773 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
774 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
775 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
776 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
777 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
778 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
779 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
780#endif // N0 > 8
781
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000782#elif K0 == 8 // K0 == 8
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000783 // This part computes the following transpositions:
784 // 8x2 -> 2x8
785 // 8x4 -> 4x8
786 // 8x8 -> 8x8
787 // 8x16 -> 16x8
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000788 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
789 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000790#if N0 > 2
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000791 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000792#endif // N0 > 2
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000793#if N0 > 3
794 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
795#endif // N0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000796#if N0 > 4
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000797 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
798 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
799 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
800 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000801#endif // N0 > 4
802#if N0 > 8
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000803 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
804 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
805 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
806 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
807 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
808 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
809 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
810 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000811#endif // N0 > 8
812
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000813#elif K0 == 16 // K0 == 16
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000814
815 // This part computes the following transpositions:
816 // 16x2 -> 2x16
817 // 16x4 -> 4x16
818 // 16x8 -> 8x16
819 // 16x16 -> 16x16
820 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
821 a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
822 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
823 a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
824#if N0 > 2
825 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
826 a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000827#endif // N0 > 2
828#if N0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000829 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
830 a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000831#endif // N0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000832#if N0 > 4
833 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
834 a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
835 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
836 a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
837 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
838 a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
839 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
840 a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
841#endif // N0 > 4
842#if N0 > 8
843 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
844 a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
845 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
846 a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
847 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
848 a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
849 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
850 a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
851 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
852 a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
853 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
854 a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
855 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
856 a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
857 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
858 a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
859#endif // N0 > 8
860
861#else // N0 == 16
862#error "Not supported N0 value"
863#endif // N0 > 2
864
865 // ---------------------------Store the output values ------------------------------
Usama Arif0681e3b2019-04-25 14:28:07 +0100866 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
867 STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000868
869#undef BLOCK_SIZE
870#undef OUTPUT_OFFSET_X
871#undef OUTPUT_STEP_X
872}
873#endif // defined(TRANSPOSE)
874#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
875
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000876#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +0000877
878#define CONCAT(a, b) a##b
879
880#define ARM_DOT1(a, b, c) \
881 ({ \
882 c = fma(a, b, c); \
883 })
884#define ARM_DOT2(a, b, c) \
885 ({ \
886 c = fma(a.s0, b.s0, c); \
887 c = fma(a.s1, b.s1, c); \
888 })
889#define ARM_DOT3(a, b, c) \
890 ({ \
891 ARM_DOT2(a, b, c); \
892 c = fma((a.s2), (b.s2), c); \
893 })
894#define ARM_DOT4(a, b, c) \
895 ({ \
896 ARM_DOT3(a, b, c); \
897 c = fma((a.s3), (b.s3), c); \
898 })
899#define ARM_DOT8(a, b, c) \
900 ({ \
901 ARM_DOT4((a.lo), (b.lo), c); \
902 ARM_DOT4((a.hi), (b.hi), c); \
903 })
904#define ARM_DOT16(a, b, c) \
905 ({ \
906 ARM_DOT8((a.lo), (b.lo), c); \
907 ARM_DOT8((a.hi), (b.hi), c); \
908 })
909
910#if N0 == 2
911#define ARM_DOT_K0XN0(k0, a, b, c) \
912 ({ \
913 CONCAT(ARM_DOT, k0) \
914 ((a), (b##0), (c.s0)); \
915 CONCAT(ARM_DOT, k0) \
916 ((a), (b##1), (c.s1)); \
917 })
918#elif N0 == 3 // N0 == 3
919#define ARM_DOT_K0XN0(k0, a, b, c) \
920 ({ \
921 CONCAT(ARM_DOT, k0) \
922 ((a), (b##0), (c.s0)); \
923 CONCAT(ARM_DOT, k0) \
924 ((a), (b##1), (c.s1)); \
925 CONCAT(ARM_DOT, k0) \
926 ((a), (b##2), (c.s2)); \
927 })
928#elif N0 == 4 // N0 == 4
929#define ARM_DOT_K0XN0(k0, a, b, c) \
930 ({ \
931 CONCAT(ARM_DOT, k0) \
932 ((a), (b##0), (c.s0)); \
933 CONCAT(ARM_DOT, k0) \
934 ((a), (b##1), (c.s1)); \
935 CONCAT(ARM_DOT, k0) \
936 ((a), (b##2), (c.s2)); \
937 CONCAT(ARM_DOT, k0) \
938 ((a), (b##3), (c.s3)); \
939 })
940#elif N0 == 8 // N0 == 8
941#define ARM_DOT_K0XN0(k0, a, b, c) \
942 ({ \
943 CONCAT(ARM_DOT, k0) \
944 ((a), (b##0), (c.s0)); \
945 CONCAT(ARM_DOT, k0) \
946 ((a), (b##1), (c.s1)); \
947 CONCAT(ARM_DOT, k0) \
948 ((a), (b##2), (c.s2)); \
949 CONCAT(ARM_DOT, k0) \
950 ((a), (b##3), (c.s3)); \
951 CONCAT(ARM_DOT, k0) \
952 ((a), (b##4), (c.s4)); \
953 CONCAT(ARM_DOT, k0) \
954 ((a), (b##5), (c.s5)); \
955 CONCAT(ARM_DOT, k0) \
956 ((a), (b##6), (c.s6)); \
957 CONCAT(ARM_DOT, k0) \
958 ((a), (b##7), (c.s7)); \
959 })
960#elif N0 == 16 // N0 == 16
961#define ARM_DOT_K0XN0(k0, a, b, c) \
962 ({ \
963 CONCAT(ARM_DOT, k0) \
964 ((a), (b##0), (c.s0)); \
965 CONCAT(ARM_DOT, k0) \
966 ((a), (b##1), (c.s1)); \
967 CONCAT(ARM_DOT, k0) \
968 ((a), (b##2), (c.s2)); \
969 CONCAT(ARM_DOT, k0) \
970 ((a), (b##3), (c.s3)); \
971 CONCAT(ARM_DOT, k0) \
972 ((a), (b##4), (c.s4)); \
973 CONCAT(ARM_DOT, k0) \
974 ((a), (b##5), (c.s5)); \
975 CONCAT(ARM_DOT, k0) \
976 ((a), (b##6), (c.s6)); \
977 CONCAT(ARM_DOT, k0) \
978 ((a), (b##7), (c.s7)); \
979 CONCAT(ARM_DOT, k0) \
980 ((a), (b##8), (c.s8)); \
981 CONCAT(ARM_DOT, k0) \
982 ((a), (b##9), (c.s9)); \
983 CONCAT(ARM_DOT, k0) \
984 ((a), (b##A), (c.sA)); \
985 CONCAT(ARM_DOT, k0) \
986 ((a), (b##B), (c.sB)); \
987 CONCAT(ARM_DOT, k0) \
988 ((a), (b##C), (c.sC)); \
989 CONCAT(ARM_DOT, k0) \
990 ((a), (b##D), (c.sD)); \
991 CONCAT(ARM_DOT, k0) \
992 ((a), (b##E), (c.sE)); \
993 CONCAT(ARM_DOT, k0) \
994 ((a), (b##F), (c.sF)); \
995 })
996#else // N0 not supported
997#error "N0 value not supported"
998#endif // N0 conditions
999
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type was passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
1059__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
1060 IMAGE_DECLARATION(rhs),
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001061#if defined(BETA)
1062 IMAGE_DECLARATION(bias),
1063#endif // defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001064 IMAGE_DECLARATION(dst),
1065 uint lhs_stride_z,
1066 uint rhs_stride_z,
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001067#if defined(BETA)
1068 uint bias_stride_z,
1069#endif //defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001070 uint dst_stride_z
1071#if defined(REINTERPRET_INPUT_AS_3D)
1072 ,
1073 uint lhs_cross_plane_pad
1074#endif // REINTERPRET_INPUT_AS_3D
1075#if defined(REINTERPRET_OUTPUT_AS_3D)
1076 ,
1077 uint dst_cross_plane_pad
1078#endif // REINTERPRET_OUTPUT_AS_3D
1079 )
1080{
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001081 // Block size
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001082#define RHS_BLOCK_SIZE ((K0) * (N0))
1083
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001084 // RHS offset and step X
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001085#if defined(RHS_INTERLEAVE)
1086#define RHS_OFFSET_X (K0)
1087#define RHS_STEP_X ((K0) * (H0))
1088#define RHS_STEP_LOOP (1)
1089#else // defined(RHS_INTERLEAVE)
1090#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1091#define RHS_STEP_X (K0)
1092#define RHS_STEP_LOOP (H0)
1093#endif // defined(RHS_INTERLEAVE)
1094
1095 uint x = get_global_id(0);
1096 uint y = get_global_id(1);
1097 uint z = get_global_id(2);
1098
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001099#if defined(DUMMY_WORK_ITEMS)
1100 if((x * N0 >= N) || (y * M0 >= M))
1101 {
1102 return;
1103 }
1104#endif // defined(DUMMY_WORK_ITEMS)
1105
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001106 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01001107 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001108
Sheri Zhang1a378102020-04-30 12:59:39 +01001109 // Compute RHS reshaped matrix address
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001110 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
1111
1112#if defined(MATRIX_B_DEPTH)
1113 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1114 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1115#else // defined(MATRIX_B_DEPTH)
1116 rhs_offset += z * rhs_stride_z;
1117#endif // defined(MATRIX_B_DEPTH)
1118
Usama Arif0681e3b2019-04-25 14:28:07 +01001119 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001120 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001121
1122#if defined(REINTERPRET_INPUT_AS_3D)
Usama Arif0681e3b2019-04-25 14:28:07 +01001123 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
1124 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001125
1126 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1127 // multiply lhs_stride_z by DEPTH_GEMM3D
1128 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1129
1130#else // defined(REINTERPRET_INPUT_AS_3D)
1131
1132 // Add offset for batched GEMM
1133 lhs_offset += z * lhs_stride_z;
1134
1135#endif // defined(REINTERPRET_INPUT_AS_3D)
1136
1137 // Initialize the accumulators
1138 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
1139
1140 int i = 0;
1141 for(; i <= (K - K0); i += K0)
1142 {
1143 // Supported cases (M0, K0):
1144 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
1145 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
1146 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
1147 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
1148 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
1149 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
1150 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
1151 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
1152 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001153 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001154
Sheri Zhang1a378102020-04-30 12:59:39 +01001155 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001156 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001157
1158 // Accumulate
1159 ARM_DOT_K0XN0(K0, a0, b, c0);
1160#if M0 > 1
1161 ARM_DOT_K0XN0(K0, a1, b, c1);
1162#endif // M0 > 1
1163#if M0 > 2
1164 ARM_DOT_K0XN0(K0, a2, b, c2);
1165#endif // M0 > 2
1166#if M0 > 3
1167 ARM_DOT_K0XN0(K0, a3, b, c3);
1168#endif // M0 > 3
1169#if M0 > 4
1170 ARM_DOT_K0XN0(K0, a4, b, c4);
1171#endif // M0 > 4
1172#if M0 > 5
1173 ARM_DOT_K0XN0(K0, a5, b, c5);
1174#endif // M0 > 5
1175#if M0 > 6
1176 ARM_DOT_K0XN0(K0, a6, b, c6);
1177#endif // M0 > 6
1178#if M0 > 7
1179 ARM_DOT_K0XN0(K0, a7, b, c7);
1180#endif // M0 > 7
1181
1182 lhs_offset += K0 * sizeof(DATA_TYPE);
1183 rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
1184 }
1185
1186 // Left-over accumulations
1187 for(; i < K; ++i)
1188 {
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001189 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001190 LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001191
Sheri Zhang1a378102020-04-30 12:59:39 +01001192 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001193 LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001194
1195 // Accumulate
1196 ARM_DOT_K0XN0(1, a0, b, c0);
1197#if M0 > 1
1198 ARM_DOT_K0XN0(1, a1, b, c1);
1199#endif // M0 > 1
1200#if M0 > 2
1201 ARM_DOT_K0XN0(1, a2, b, c2);
1202#endif // M0 > 2
1203#if M0 > 3
1204 ARM_DOT_K0XN0(1, a3, b, c3);
1205#endif // M0 > 3
1206#if M0 > 4
1207 ARM_DOT_K0XN0(1, a4, b, c4);
1208#endif // M0 > 4
1209#if M0 > 5
1210 ARM_DOT_K0XN0(1, a5, b, c5);
1211#endif // M0 > 5
1212#if M0 > 6
1213 ARM_DOT_K0XN0(1, a6, b, c6);
1214#endif // M0 > 6
1215#if M0 > 7
1216 ARM_DOT_K0XN0(1, a7, b, c7);
1217#endif // M0 > 7
1218
1219 lhs_offset += sizeof(DATA_TYPE);
1220 rhs_offset += sizeof(DATA_TYPE);
1221 }
1222
SiCong Li406a13f2020-07-15 12:09:58 +01001223 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001224
1225 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1226
1227#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001228
1229 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +01001230 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001231
1232 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1233 // multiply dst_stride_z by DEPTH_GEMM3D
1234 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1235
1236#else // defined(REINTERPRET_OUTPUT_AS_3D)
1237
1238 // Add offset for batched GEMM
1239 dst_addr += z * dst_stride_z;
1240
1241#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1242
1243 // Multiply by the weight of matrix-matrix product and store the result
1244#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01001245 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001246#endif // defined(ALPHA)
1247
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001248 // Add beta*bias
1249#if defined(BETA)
1250#if defined(BROADCAST_BIAS)
1251 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
1252
1253 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1254
1255#ifndef UNIT_BETA
1256 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
1257#endif // UNIT_BIAS
1258
1259 // c = c + bias[broadcasted]
1260 ADD_BLOCK_BROADCAST(M0, c, bias0);
1261
1262#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01001263 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001264
1265 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1266
1267#ifndef UNIT_BETA
1268 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
1269#endif // UNIT_BIAS
1270
1271 // c = c + bias
1272 ADD_BLOCK(M0, c, bias);
1273
1274#endif // defined(BROADCAST_BIAS)
1275#endif // defined(BETA)
1276
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001277#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001278 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001279#endif // defined(ACTIVATION_TYPE)
1280
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01001281 const bool cond_y = y == 0;
1282 const bool cond_x = ((x + 1) * N0 >= N);
1283
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001284 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001285 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001286
1287#undef RHS_BLOCK_SIZE
1288#undef RHS_OFFSET_X
1289#undef RHS_STEP_X
1290}
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001291
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If -DMATRIX_B_DEPTH is passed at compile time, the Z index used to address the RHS matrix is taken modulo MATRIX_B_DEPTH.
 * @note If -DALPHA is passed at compile time, the accumulated block is scaled by ALPHA before the bias addition.
 * @note If -DBETA is passed at compile time, the bias arguments are part of the kernel signature and the bias block is added to the result.
 *       The bias is scaled by BETA unless -DUNIT_BETA is passed. With -DBROADCAST_BIAS a single bias row is loaded and added to every row of the block.
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t rhs_img,
#if defined(BETA)
                                                  IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                  IMAGE_DECLARATION(dst),
                                                  uint lhs_stride_z,
                                                  uint rhs_stride_z,
#if defined(BETA)
                                                  uint bias_stride_z,
#endif //defined(BETA)
                                                  uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                  ,
                                                  uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
                                                  uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Number of K elements that do not fill a complete K0 block
#define LEFTOVER_K (K % K0)

    // Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the M x N output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates (the batch index is folded into the Y coordinate through RHS_HEIGHT)
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: consume K in complete blocks of K0 elements
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

#if LEFTOVER_K != 0
    // Note: We cannot read out-of-bound elements from the RHS matrix because
    // the RHS width is always multiple of K0. This is not true for the LHS matrix

    // The union gives scalar (per-element) write access to a K0-wide vector, so that only the
    // LEFTOVER_K valid LHS elements are read from memory
    union UNION_VEC_TYPE
    {
        DATA_TYPE s[K0];
        VEC_DATA_TYPE(DATA_TYPE, K0)
        v;
    };

    // The LHS vectors start zeroed; the load loop below only overwrites the first LEFTOVER_K elements
    union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
    union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
    union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
    union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
    union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
    union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
    union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
    union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7

    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

    // Load from RHS matrix
    LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

    // Load from LHS matrix, one element per row and per iteration
    for(int k = 0; k < LEFTOVER_K; ++k)
    {
        a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
        a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
        a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
        a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
        a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
        a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
        a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
        a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7

        lhs_offset += sizeof(DATA_TYPE);
    }

    // Accumulate
    ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
    ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
    ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
    ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
    ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
    ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
    ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
    ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7

#endif // LEFTOVER_K != 0

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0, zout1=0, ... one variable per row of the block

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // cond_y is true for the first block along Y; cond_x is true when this block reaches or crosses column N
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // Drop every kernel-local helper macro so that none of them leaks into the code that follows
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef LEFTOVER_K
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
1640
/** Fused multiply-add with in-place accumulation: c = fma(a, b, c).
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator. It must be a modifiable lvalue since the result is written back to it
 */
#define VFMA(a, b, c)               \
    ({                              \
        (c) = fma((a), (b), (c));   \
    })
1645
1646#if M0 == 1
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001647#define VFMA_M0xN0(i, a, b, c) \
1648 ({ \
1649 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001650 })
1651#elif M0 == 2 // M0 == 2
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001652#define VFMA_M0xN0(i, a, b, c) \
1653 ({ \
1654 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1655 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001656 })
1657#elif M0 == 3 // M0 == 3
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001658#define VFMA_M0xN0(i, a, b, c) \
1659 ({ \
1660 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1661 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1662 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001663 })
1664#elif M0 == 4 // M0 == 4
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001665#define VFMA_M0xN0(i, a, b, c) \
1666 ({ \
1667 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1668 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1669 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1670 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001671 })
1672#elif M0 == 5 // M0 == 5
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001673#define VFMA_M0xN0(i, a, b, c) \
1674 ({ \
1675 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1676 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1677 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1678 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1679 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001680 })
1681#elif M0 == 6 // M0 == 6
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001682#define VFMA_M0xN0(i, a, b, c) \
1683 ({ \
1684 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1685 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1686 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1687 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1688 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1689 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001690 })
1691#elif M0 == 7 // M0 == 7
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001692#define VFMA_M0xN0(i, a, b, c) \
1693 ({ \
1694 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1695 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1696 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1697 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1698 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1699 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
1700 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001701 })
1702#elif M0 == 8 // M0 == 8
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001703#define VFMA_M0xN0(i, a, b, c) \
1704 ({ \
1705 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1706 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1707 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1708 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1709 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1710 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
1711 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
1712 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001713 })
1714#else // M0 not supported
1715#error "M0 not supported"
1716#endif // M0 not supported
1717
1718/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1719 * The LHS matrix is NOT reshaped
1720 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
1721 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001722 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
1724 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
1725 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
1726 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01001728 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1729 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001730 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1731 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1732 * - N0 = 2, 3, 4, 8, 16
1733 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001734 * - H0 >= 1
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001735 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01001736 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001737 * The activation function is performed after the bias addition
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001738 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1739 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1740 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1741 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1742 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1743 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1744 *
Sheri Zhang1a378102020-04-30 12:59:39 +01001745 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
1746 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001747 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001748 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001749 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001750 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001751 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1752 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1753 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1754 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1755 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1756 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001757 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
1758 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001759 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001760 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001761 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
1762 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
1763 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1764 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1765 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1766 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1767 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1768 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Sheri Zhang1a378102020-04-30 12:59:39 +01001769 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001770 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001771 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001772 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1773 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1774 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001775 */
__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
                                           IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                           IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                           IMAGE_DECLARATION(dst),
                                           uint lhs_stride_z,
                                           uint rhs_stride_z,
#if defined(BETA)
                                           uint bias_stride_z,
#endif //defined(BETA)
                                           uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                           ,
                                           uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                           ,
                                           uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                          )
{
    // Number of elements in one reshaped K0xN0 RHS block
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS addressing constants (all in elements):
    //  - RHS_OFFSET_X : multiplied by (x % H0) to locate this work-item's block on the reshaped row
    //  - RHS_STEP_X   : distance between two consecutive N0-wide rows loaded for the same block
    //  - RHS_STEP_LOOP: additional factor applied when the main loop advances by K0 rows
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Nothing to do for work-items whose block starts outside the M x N output
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address (byte offset of the first row handled by this work-item)
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Compute RHS reshaped matrix address: (x % H0) selects the block within the reshaped row,
    // (x / H0) selects the reshaped row
    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   //uint zin0=0,zin1=0,zin2=0,... zin7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero15=0;

#if defined(REINTERPRET_INPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators: one N0-wide vector per output row
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

    // Main loop: consume K0 values of the K dimension per iteration.
    // When K < K0 the bound (K - K0) is negative and the loop body never runs.
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Supported cases (M0, K0):
        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        // One N0-wide RHS row is loaded and accumulated per k index; b0 is reused for every row
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(0, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(4, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(5, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(6, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(8, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(9, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(A, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(B, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(C, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(D, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(E, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
    }

    // Left-over accumulations: one k index per iteration for the K % K0 remaining values
    for(; i < K; ++i)
    {
        // Load values from LHS matrix.
        // Each scalar is stored in a 2-element vector; presumably this lets VFMA_M0xN0 (defined
        // elsewhere) address it with the same per-lane access used in the main loop - confirm.
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
    }

    // Destination address of the first element of the M0xN0 output block
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single N0-wide bias row is loaded and added to every row of the block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // NOTE(review): VEC_SIZE is neither defined nor documented by this kernel, while the
    // accumulators c are N0 wide - confirm the build options supply a matching VEC_SIZE.
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Boundary flags handed to the store macro: first block row, and block reaching the last column
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // NOTE(review): RHS_STEP_LOOP is not #undef'd together with the macros below; later code in
    // this file may depend on it still being defined, so audit its users before adding an #undef.
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002033
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes). Not read by this kernel: the RHS Z coordinate is derived from RHS_HEIGHT
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                   __read_only image2d_t rhs_img,
#if defined(BETA)
                                                   IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                   IMAGE_DECLARATION(dst),
                                                   uint lhs_stride_z,
                                                   uint rhs_stride_z,
#if defined(BETA)
                                                   uint bias_stride_z,
#endif //defined(BETA)
                                                   uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                   ,
                                                   uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                   ,
                                                   uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                  )
{
    // Pixel unit: all RHS X coordinates and steps below are expressed in image pixels
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Block size
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS offset and step X.
    // RHS_STEP_LOOP is defined here as well so that this kernel does not depend on a definition
    // left over from another kernel in the file. The replacement lists are kept token-identical
    // to the ones used by gemm_mm_reshaped_only_rhs_nt, so a still-visible earlier definition is
    // a benign redefinition.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Nothing to do for work-items whose block starts outside the M x N output
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address (byte offset of the first row handled by this work-item)
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = z;
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates: the batch index is folded into the image Y coordinate
    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   //uint zin0=0,zin1=0,zin2=0,... zin7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero15=0;

#if defined(REINTERPRET_INPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators: one N0-wide vector per output row
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

    // Main loop: consume K0 values of the K dimension per iteration.
    // When K < K0 the bound (K - K0) is negative and the loop body never runs.
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        // One N0-wide RHS row is read from the image and accumulated per k index
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(0, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(4, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(5, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(6, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(8, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(9, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(A, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(B, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(C, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(D, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(E, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Left-over accumulations: one k index per iteration for the K % K0 remaining values
    for(; i < K; ++i)
    {
        // Load values from LHS matrix.
        // Each scalar is stored in a 2-element vector; presumably this lets VFMA_M0xN0 (defined
        // elsewhere) address it with the same per-lane access used in the main loop - confirm.
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        x_rhs += RHS_STEP_X;
    }

    // Destination address of the first element of the M0xN0 output block
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single N0-wide bias row is loaded and added to every row of the block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // NOTE(review): VEC_SIZE is neither defined nor documented by this kernel, while the
    // accumulators c are N0 wide - confirm the build options supply a matching VEC_SIZE.
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Boundary flags handed to the store macro: first block row, and block reaching the last column
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // NOTE(review): PIXEL_UNIT and RHS_STEP_LOOP are left defined past this kernel, as before;
    // audit later users in this file before adding #undef lines for them.
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002343#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00002344
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002345#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002346
/* ARM_DOT_K0(a, b, c): accumulate into the scalar c the products of the first K0 lanes of the
 * vectors a and b, lane by lane in ascending order (s0, s1, ...).
 *
 * The per-lane operation is the only thing that differs between the two build flavours:
 *  - MIXED_PRECISION defined    : c += a.sX * b.sX
 *  - MIXED_PRECISION not defined: c = fma(a.sX, b.sX, c)
 */
#if defined(MIXED_PRECISION)
#define ARM_DOT_K0_MAC(a, b, c, lane) c += a.lane * b.lane
#else // defined(MIXED_PRECISION)
#define ARM_DOT_K0_MAC(a, b, c, lane) c = fma(a.lane, b.lane, c)
#endif // defined(MIXED_PRECISION)

/* Cumulative lane lists: each list extends the previous supported size */
#define ARM_DOT_K0_LANES_2(a, b, c) \
    ARM_DOT_K0_MAC(a, b, c, s0);    \
    ARM_DOT_K0_MAC(a, b, c, s1);

#define ARM_DOT_K0_LANES_3(a, b, c) \
    ARM_DOT_K0_LANES_2(a, b, c)     \
    ARM_DOT_K0_MAC(a, b, c, s2);

#define ARM_DOT_K0_LANES_4(a, b, c) \
    ARM_DOT_K0_LANES_3(a, b, c)     \
    ARM_DOT_K0_MAC(a, b, c, s3);

#define ARM_DOT_K0_LANES_8(a, b, c) \
    ARM_DOT_K0_LANES_4(a, b, c)     \
    ARM_DOT_K0_MAC(a, b, c, s4);    \
    ARM_DOT_K0_MAC(a, b, c, s5);    \
    ARM_DOT_K0_MAC(a, b, c, s6);    \
    ARM_DOT_K0_MAC(a, b, c, s7);

#define ARM_DOT_K0_LANES_16(a, b, c) \
    ARM_DOT_K0_LANES_8(a, b, c)      \
    ARM_DOT_K0_MAC(a, b, c, s8);     \
    ARM_DOT_K0_MAC(a, b, c, s9);     \
    ARM_DOT_K0_MAC(a, b, c, sA);     \
    ARM_DOT_K0_MAC(a, b, c, sB);     \
    ARM_DOT_K0_MAC(a, b, c, sC);     \
    ARM_DOT_K0_MAC(a, b, c, sD);     \
    ARM_DOT_K0_MAC(a, b, c, sE);     \
    ARM_DOT_K0_MAC(a, b, c, sF);

/* Select the lane list matching the compile-time K0 */
#if K0 == 2
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_2(a, b, c) })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_3(a, b, c) })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_4(a, b, c) })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_8(a, b, c) })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_16(a, b, c) })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002462
/* ARM_DOT_K0XN0(a, b, c): for every column i in [0, N0), accumulate into lane i of the vector c
 * the K0-wide dot product of a with the vector named <b><i> (b is a variable basename, e.g. b -> b0, b1, ...).
 */

/* Single column: dot(a, b<i>) accumulated into c.s<i> */
#define ARM_DOT_K0XN0_COL(a, b, c, i) ARM_DOT_K0((a), (b##i), (c.s##i))

/* Cumulative column lists: each list extends the previous supported size */
#define ARM_DOT_K0XN0_COLS_2(a, b, c) \
    ARM_DOT_K0XN0_COL(a, b, c, 0);    \
    ARM_DOT_K0XN0_COL(a, b, c, 1);

#define ARM_DOT_K0XN0_COLS_3(a, b, c) \
    ARM_DOT_K0XN0_COLS_2(a, b, c)     \
    ARM_DOT_K0XN0_COL(a, b, c, 2);

#define ARM_DOT_K0XN0_COLS_4(a, b, c) \
    ARM_DOT_K0XN0_COLS_3(a, b, c)     \
    ARM_DOT_K0XN0_COL(a, b, c, 3);

#define ARM_DOT_K0XN0_COLS_8(a, b, c) \
    ARM_DOT_K0XN0_COLS_4(a, b, c)     \
    ARM_DOT_K0XN0_COL(a, b, c, 4);    \
    ARM_DOT_K0XN0_COL(a, b, c, 5);    \
    ARM_DOT_K0XN0_COL(a, b, c, 6);    \
    ARM_DOT_K0XN0_COL(a, b, c, 7);

#define ARM_DOT_K0XN0_COLS_16(a, b, c) \
    ARM_DOT_K0XN0_COLS_8(a, b, c)      \
    ARM_DOT_K0XN0_COL(a, b, c, 8);     \
    ARM_DOT_K0XN0_COL(a, b, c, 9);     \
    ARM_DOT_K0XN0_COL(a, b, c, A);     \
    ARM_DOT_K0XN0_COL(a, b, c, B);     \
    ARM_DOT_K0XN0_COL(a, b, c, C);     \
    ARM_DOT_K0XN0_COL(a, b, c, D);     \
    ARM_DOT_K0XN0_COL(a, b, c, E);     \
    ARM_DOT_K0XN0_COL(a, b, c, F);

/* Select the column list matching the compile-time N0 */
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c) ({ ARM_DOT_K0XN0_COLS_2(a, b, c) })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c) ({ ARM_DOT_K0XN0_COLS_3(a, b, c) })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c) ({ ARM_DOT_K0XN0_COLS_4(a, b, c) })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c) ({ ARM_DOT_K0XN0_COLS_8(a, b, c) })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c) ({ ARM_DOT_K0XN0_COLS_16(a, b, c) })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions
2519
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90). The K dimension is read at runtime from the @p k argument.
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped. Bounds the accumulation loop, which advances K0 elements per iteration.
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
                                            IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                            IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                            IMAGE_DECLARATION(dst),
                                            uint k,
                                            uint lhs_stride_z,
                                            uint rhs_stride_z,
#if defined(BETA)
                                            uint bias_stride_z,
#endif // defined(BETA)
                                            uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                            ,
                                            uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                           )
{
    // LHS block size (in elements)
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size (in elements)
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X ((K0) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (K0)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose output block starts outside the MxN output have nothing to do
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

    // Compute RHS matrix address
    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_addr += get_global_id(2) * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize the accumulators: M0 vectors (c0, c1, ...) of N0 elements each
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    // The counter is unsigned to match the type of k (avoids a signed/unsigned comparison)
    for(uint i = 0; i < k; i += K0)
    {
        // Supported cases (M0, K0):
        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

        // Load values from RHS matrix
        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

        // Accumulate: one ARM_DOT_K0XN0 per LHS row held by this work-item
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif // M0 > 7

        // Move both pointers to the next K0-wide slice
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += get_global_id(2) * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // The accumulators are declared above as N0-wide vectors, so N0 is passed as the vector size
    // instead of relying on a separate, undocumented -DVEC_SIZE build option.
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // True when this work-item's block reaches (or crosses) the bottom / right edge of the MxN output
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
giuros01b3204e72019-04-01 13:50:22 +01002792
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
 *       The accumulation loop of this kernel is bounded by the compile-time K, not by the runtime @p k argument.
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped. Accepted for signature parity; the kernel body does not read it.
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes). Not read by the kernel body: the RHS batch is selected through the image row (z * RHS_HEIGHT).
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                    IMAGE_DECLARATION(dst),
                                                    uint k,
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif // defined(BETA)
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                                   )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // LHS block size (in elements)
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size (in pixel units)
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose output block starts outside the MxN output have nothing to do
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates: the batch index is folded into the image row
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Initialize the accumulators: M0 vectors (c0, c1, ...) of N0 elements each
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    // NOTE(review): the bound is the compile-time K while the buffer-based sibling kernel uses the
    // runtime argument k; confirm that the host always builds this kernel with -DK equal to k.
    for(int i = 0; i < K; i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate: one ARM_DOT_K0XN0 per LHS row held by this work-item
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif // M0 > 7

        // Move the LHS pointer and the RHS x coordinate to the next K0-wide slice
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += get_global_id(2) * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // The accumulators are declared above as N0-wide vectors, so N0 is passed as the vector size
    // instead of relying on a separate, undocumented -DVEC_SIZE build option.
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // True when this work-item's block reaches (or crosses) the bottom / right edge of the MxN output
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
3064
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003065#if defined(LHS_TRANSPOSE)
3066
// Shorthand for the vector type built by VEC_DATA_TYPE(TYPE, SIZE)
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(N0, a, b, c): multiply-accumulate c = c + a * b
// - With MIXED_PRECISION, a and b are first converted to VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)
// - With GPU_ARCH == GPU_ARCH_MIDGARD a separate multiply and add is emitted, otherwise fma() is used
// Note: the expansion already ends with ';'
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)

// Column-vector (transposed) by row-vector (not transposed) multiplication for a fixed M0. K0 = 1
// Row i of the output (C##i) accumulates a.s<i> * b, where a.s<i> is cast to VTYPE(TYPE, N0).
// For M0 = 1 the argument a is used directly instead of a component.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)          \
    ({                                                  \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0));  \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
// M0 is token-pasted into the macro name, so it must already be a literal when this macro is reached
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

// Matrix (transposed) by matrix (not transposed) multiplication for a fixed K0.
// A, B and C are variable basenames: step k multiplies the vectors A##k and B##k and accumulates into C##0..C##(M0-1)
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
    })
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
    })
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
    })
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
    })
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
    })
// The K0 = 16 variant pastes the hexadecimal suffixes 8..F onto the basenames. Its parameters are
// deliberately NOT called A, B and C: a parameter named A would turn "A##A" into "<arg><arg>"
// instead of "<arg>A". Steps 8..F go through the vector-by-vector factory, exactly like steps 0..7,
// because the operands are already complete variable names at this point.
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A_BASENAME, B_BASENAME, C_BASENAME)                   \
    ({                                                                                           \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A_BASENAME, B_BASENAME, C_BASENAME);                   \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A_BASENAME##8), (B_BASENAME##8), C_BASENAME);        \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A_BASENAME##9), (B_BASENAME##9), C_BASENAME);        \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A_BASENAME##A), (B_BASENAME##A), C_BASENAME);        \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A_BASENAME##B), (B_BASENAME##B), C_BASENAME);        \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A_BASENAME##C), (B_BASENAME##C), C_BASENAME);        \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A_BASENAME##D), (B_BASENAME##D), C_BASENAME);        \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A_BASENAME##E), (B_BASENAME##E), C_BASENAME);        \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A_BASENAME##F), (B_BASENAME##F), C_BASENAME);        \
    })

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// The dimensions for this matrix multiplications are defined through M0, N0 and K0
// The dimensions supported are:
// M0: 1, 2, 3, 4, 8
// N0: 1, 2, 3, 4, 8, 16
// K0: 1, 2, 3, 4, 8, 16
// This macro calls the vector-by-vector macro K0 times
// A, B and C are matrices (variable basenames)
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)
3174
3175/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
3176 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3177 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3178 *
3179 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
3180 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003181 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003182 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3183 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3184 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3185 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3186 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003187 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
3188 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003189 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3190 * - M0 = 2, 3, 4, 8
3191 * - N0 = 2, 3, 4, 8, 16
3192 * - K0 = 2, 3, 4, 8, 16
3193 * - V0 >= 1
3194 * - H0 >= 1
3195 *
3196 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3197 * The activation function is performed after the bias addition
3198 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3199 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3200 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3201 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3202 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3203 *
3204 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
3205 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3206 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3207 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3208 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3209 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3210 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
3211 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
3212 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3213 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
3214 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3215 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
3216 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3217 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3218 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3219 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3220 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3221 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3222 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3223 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3224 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3225 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3226 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3227 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003228 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003229 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3230 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3231 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3232 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3233 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3234 */
3235__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
3236 IMAGE_DECLARATION(rhs),
3237#if defined(BETA)
3238 IMAGE_DECLARATION(bias),
3239#endif // defined(BETA)
3240 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003241 uint k,
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003242 uint lhs_stride_z,
3243 uint rhs_stride_z,
3244#if defined(BETA)
3245 uint bias_stride_z,
3246#endif //defined(BETA)
3247 uint dst_stride_z
3248#if defined(REINTERPRET_OUTPUT_AS_3D)
3249 ,
3250 uint dst_cross_plane_pad
3251#endif // REINTERPRET_OUTPUT_AS_3D
3252 )
3253{
3254 // Block size
3255#define LHS_BLOCK_SIZE ((K0) * (M0))
3256
3257#if defined(LHS_INTERLEAVE)
3258#define LHS_OFFSET_X (M0)
3259#define LHS_STEP_X ((M0) * (V0))
3260#define LHS_STEP_LOOP (1)
3261#else // defined(INTERLEAVE)
3262#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3263#define LHS_STEP_X (M0)
3264#define LHS_STEP_LOOP (V0)
3265#endif // defined(INTERLEAVE)
3266
3267 // Block size
3268#define RHS_BLOCK_SIZE ((K0) * (N0))
3269
3270 // RHS offset and step X
3271#if defined(RHS_INTERLEAVE)
3272#define RHS_OFFSET_X (N0)
3273#define RHS_STEP_X ((N0) * (H0))
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003274#else // defined(RHS_INTERLEAVE)
3275#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3276#define RHS_STEP_X (N0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003277#endif // defined(RHS_INTERLEAVE)
3278
3279 const uint x = get_global_id(0);
3280 const uint y = get_global_id(1);
3281 const uint z = get_global_id(2);
3282
3283#if defined(DUMMY_WORK_ITEMS)
3284 if((x * N0 >= N) || (y * M0 >= M))
3285 {
3286 return;
3287 }
3288#endif // defined(DUMMY_WORK_ITEMS)
3289
3290 // Compute LHS matrix address
3291 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3292
3293 // Compute RHS matrix address
3294 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
3295
3296#if defined(MATRIX_B_DEPTH)
3297 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3298 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
3299#else // defined(MATRIX_B_DEPTH)
3300 rhs_addr += z * rhs_stride_z;
3301#endif // defined(MATRIX_B_DEPTH)
3302
3303 // Initialize the accumulators
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003304 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003305
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003306 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3307
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003308 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3309 __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
3310
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003311 for(int i = 0; i < k; i += K0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003312 {
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003313 VEC_DATA_TYPE(DATA_TYPE, M0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003314 a0;
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003315 VEC_DATA_TYPE(DATA_TYPE, N0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003316 b0;
3317
3318 a0 = VLOAD(M0)(0, lhs);
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003319 b0 = VLOAD(N0)(0, rhs);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003320
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003321 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003322
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003323 lhs += LHS_STEP_X;
3324 rhs += RHS_STEP_X;
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003325
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003326#if K0 > 1
3327 a0 = VLOAD(M0)(0, lhs);
3328 b0 = VLOAD(N0)(0, rhs);
3329
3330 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3331
3332 lhs += LHS_STEP_X;
3333 rhs += RHS_STEP_X;
3334#endif // K0 > 1
3335
3336#if K0 > 2
3337 a0 = VLOAD(M0)(0, lhs);
3338 b0 = VLOAD(N0)(0, rhs);
3339
3340 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3341
3342 lhs += LHS_STEP_X;
3343 rhs += RHS_STEP_X;
3344#endif // K0 > 2
3345
3346#if K0 > 3
3347 a0 = VLOAD(M0)(0, lhs);
3348 b0 = VLOAD(N0)(0, rhs);
3349
3350 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3351
3352 lhs += LHS_STEP_X;
3353 rhs += RHS_STEP_X;
3354#endif // K0 > 3
3355
3356#if K0 > 4
3357 a0 = VLOAD(M0)(0, lhs);
3358 b0 = VLOAD(N0)(0, rhs);
3359
3360 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3361
3362 lhs += LHS_STEP_X;
3363 rhs += RHS_STEP_X;
3364
3365 a0 = VLOAD(M0)(0, lhs);
3366 b0 = VLOAD(N0)(0, rhs);
3367
3368 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3369
3370 lhs += LHS_STEP_X;
3371 rhs += RHS_STEP_X;
3372
3373 a0 = VLOAD(M0)(0, lhs);
3374 b0 = VLOAD(N0)(0, rhs);
3375
3376 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3377
3378 lhs += LHS_STEP_X;
3379 rhs += RHS_STEP_X;
3380
3381 a0 = VLOAD(M0)(0, lhs);
3382 b0 = VLOAD(N0)(0, rhs);
3383
3384 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3385
3386 lhs += LHS_STEP_X;
3387 rhs += RHS_STEP_X;
3388#endif // K0 > 4
3389
3390#if K0 > 8
3391 a0 = VLOAD(M0)(0, lhs);
3392 b0 = VLOAD(N0)(0, rhs);
3393
3394 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3395
3396 lhs += LHS_STEP_X;
3397 rhs += RHS_STEP_X;
3398
3399 a0 = VLOAD(M0)(0, lhs);
3400 b0 = VLOAD(N0)(0, rhs);
3401
3402 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3403
3404 lhs += LHS_STEP_X;
3405 rhs += RHS_STEP_X;
3406
3407 a0 = VLOAD(M0)(0, lhs);
3408 b0 = VLOAD(N0)(0, rhs);
3409
3410 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3411
3412 lhs += LHS_STEP_X;
3413 rhs += RHS_STEP_X;
3414
3415 a0 = VLOAD(M0)(0, lhs);
3416 b0 = VLOAD(N0)(0, rhs);
3417
3418 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3419
3420 lhs += LHS_STEP_X;
3421 rhs += RHS_STEP_X;
3422
3423 a0 = VLOAD(M0)(0, lhs);
3424 b0 = VLOAD(N0)(0, rhs);
3425
3426 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3427
3428 lhs += LHS_STEP_X;
3429 rhs += RHS_STEP_X;
3430
3431 a0 = VLOAD(M0)(0, lhs);
3432 b0 = VLOAD(N0)(0, rhs);
3433
3434 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3435
3436 lhs += LHS_STEP_X;
3437 rhs += RHS_STEP_X;
3438
3439 a0 = VLOAD(M0)(0, lhs);
3440 b0 = VLOAD(N0)(0, rhs);
3441
3442 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3443
3444 lhs += LHS_STEP_X;
3445 rhs += RHS_STEP_X;
3446
3447 a0 = VLOAD(M0)(0, lhs);
3448 b0 = VLOAD(N0)(0, rhs);
3449
3450 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3451
3452 lhs += LHS_STEP_X;
3453 rhs += RHS_STEP_X;
3454#endif // K0 > 8
3455
3456#ifndef LHS_INTERLEAVE
3457 lhs += (M0 * K0 * (V0 - 1));
3458#endif // LHS_INTERLEAVE
3459
3460#ifndef RHS_INTERLEAVE
3461 rhs += (N0 * K0 * (H0 - 1));
3462#endif // RHS_INTERLEAVE
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003463 }
3464
3465 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3466
3467 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3468
3469#if defined(REINTERPRET_OUTPUT_AS_3D)
3470
3471 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
3472 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
3473 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3474 // multiply dst_stride_z by DEPTH_GEMM3D
3475 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3476
3477#else // defined(REINTERPRET_OUTPUT_AS_3D)
3478
3479 // Add offset for batched GEMM
3480 dst_addr += z * dst_stride_z;
3481
3482#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3483
3484 // Multiply by the weight of matrix-matrix product and store the result
3485#if defined(ALPHA)
3486 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3487#endif // defined(ALPHA)
3488
3489 // Add beta*bias
3490#if defined(BETA)
3491#if defined(BROADCAST_BIAS)
3492 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3493
3494 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3495
3496#ifndef UNIT_BETA
3497 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3498#endif // UNIT_BIAS
3499
3500 // c = c + bias[broadcasted]
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003501#if defined(MIXED_PRECISION)
3502 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3503 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3504#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003505 ADD_BLOCK_BROADCAST(M0, c, bias0);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003506#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003507
3508#else // defined(BROADCAST_BIAS)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003509 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
3510 2) * bias_stride_z;
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003511
3512 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3513
3514#ifndef UNIT_BETA
3515 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3516#endif // UNIT_BIAS
3517
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003518#if defined(MIXED_PRECISION)
3519 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3520 ADD_BLOCK(M0, c, bias_hp);
3521#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003522 ADD_BLOCK(M0, c, bias);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003523#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003524
3525#endif // defined(BROADCAST_BIAS)
3526#endif // defined(BETA)
3527
3528#if defined(ACTIVATION_TYPE)
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003529#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003530 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003531#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003532 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003533#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003534#endif // defined(ACTIVATION_TYPE)
3535
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003536 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
3537 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
3538
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003539 // Store output block
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003540#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003541 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003542 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003543#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003544 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003545#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003546
3547#undef LHS_BLOCK_SIZE
3548#undef LHS_OFFSET_X
3549#undef LHS_STEP_X
3550#undef RHS_BLOCK_SIZE
3551#undef RHS_OFFSET_X
3552#undef RHS_STEP_X
3553}
3554
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003555#if defined(OPENCL_IMAGE_SUPPORT)
3556/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
3557 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3558 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3559 *
3560 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
3561 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003562 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
3563 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Gian Marco Iodice781cba72020-06-19 16:56:57 +01003564 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
3565 * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
3566 * could be different from the value returned by get_image_height(rhs_img).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003567 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3568 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3569 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3570 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3571 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003572 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
3573 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003574 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3575 * - M0 = 2, 3, 4, 8
3576 * - N0 = 4, 8, 16
3577 * - K0 = 4, 8, 16
3578 * - V0 >= 1
3579 * - H0 >= 1
3580 *
3581 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3582 * The activation function is performed after the bias addition
3583 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3584 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3585 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3586 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3587 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3588 *
3589 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
3590 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3591 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3592 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3593 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3594 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3595 * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
3596 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3597 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3598 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3599 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3600 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3601 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3602 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3603 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3604 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3605 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3606 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3607 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003608 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003609 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3610 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3611 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3612 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3613 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3614 */
3615__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
3616 __read_only image2d_t rhs_img,
3617#if defined(BETA)
3618 IMAGE_DECLARATION(bias),
3619#endif // defined(BETA)
3620 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003621 uint k,
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003622 uint lhs_stride_z,
3623 uint rhs_stride_z,
3624#if defined(BETA)
3625 uint bias_stride_z,
3626#endif //defined(BETA)
3627 uint dst_stride_z
3628#if defined(REINTERPRET_OUTPUT_AS_3D)
3629 ,
3630 uint dst_cross_plane_pad
3631#endif // REINTERPRET_OUTPUT_AS_3D
3632 )
3633{
3634 // Pixel unit
3635#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
3636
3637 // Block size
3638#define LHS_BLOCK_SIZE ((K0) * (M0))
3639
3640#if defined(LHS_INTERLEAVE)
3641#define LHS_OFFSET_X (M0)
3642#define LHS_STEP_X ((M0) * (V0))
3643#define LHS_STEP_LOOP (1)
3644#else // defined(INTERLEAVE)
3645#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3646#define LHS_STEP_X (M0)
3647#define LHS_STEP_LOOP (V0)
3648#endif // defined(INTERLEAVE)
3649
3650 // Block size
3651#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
3652
3653 // RHS offset and step X
3654#if defined(RHS_INTERLEAVE)
3655#define RHS_OFFSET_X (PIXEL_UNIT)
3656#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
3657#else // defined(RHS_INTERLEAVE)
3658#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3659#define RHS_STEP_X (PIXEL_UNIT)
3660#endif // defined(RHS_INTERLEAVE)
3661
3662 const uint x = get_global_id(0);
3663 const uint y = get_global_id(1);
3664 const uint z = get_global_id(2);
3665
3666#if defined(DUMMY_WORK_ITEMS)
3667 if((x * N0 >= N) || (y * M0 >= M))
3668 {
3669 return;
3670 }
3671#endif // defined(DUMMY_WORK_ITEMS)
3672
3673 // Compute LHS matrix address
3674 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3675
3676#if defined(MATRIX_B_DEPTH)
3677 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3678 const uint z_rhs = (z % MATRIX_B_DEPTH);
3679#else // defined(MATRIX_B_DEPTH)
3680 const uint z_rhs = z;
3681#endif // defined(MATRIX_B_DEPTH)
3682
3683 // Compute RHS matrix coordinates
3684 uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
3685 const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
3686
3687 // Initialize the accumulators
3688 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
3689
3690 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3691
3692 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3693
3694 for(int i = 0; i < K; i += K0)
3695 {
3696 VEC_DATA_TYPE(DATA_TYPE, M0)
3697 a0;
3698 VEC_DATA_TYPE(DATA_TYPE, N0)
3699 b0;
3700
3701 a0 = VLOAD(M0)(0, lhs);
3702 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
3703
3704 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3705
3706 lhs += LHS_STEP_X;
3707
3708#if K0 > 1
3709 a0 = VLOAD(M0)(0, lhs);
3710 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
3711
3712 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3713
3714 lhs += LHS_STEP_X;
3715#endif // K0 > 1
3716
3717#if K0 > 2
3718 a0 = VLOAD(M0)(0, lhs);
3719 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
3720
3721 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3722
3723 lhs += LHS_STEP_X;
3724#endif // K0 > 2
3725
3726#if K0 > 3
3727 a0 = VLOAD(M0)(0, lhs);
3728 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
3729
3730 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3731
3732 lhs += LHS_STEP_X;
3733#endif // K0 > 3
3734
3735#if K0 > 4
3736 a0 = VLOAD(M0)(0, lhs);
3737 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
3738
3739 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3740
3741 lhs += LHS_STEP_X;
3742
3743 a0 = VLOAD(M0)(0, lhs);
3744 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
3745
3746 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3747
3748 lhs += LHS_STEP_X;
3749
3750 a0 = VLOAD(M0)(0, lhs);
3751 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
3752
3753 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3754
3755 lhs += LHS_STEP_X;
3756
3757 a0 = VLOAD(M0)(0, lhs);
3758 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
3759
3760 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3761
3762 lhs += LHS_STEP_X;
3763#endif // K0 > 4
3764
3765#if K0 > 8
3766 a0 = VLOAD(M0)(0, lhs);
3767 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
3768
3769 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3770
3771 lhs += LHS_STEP_X;
3772
3773 a0 = VLOAD(M0)(0, lhs);
3774 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
3775
3776 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3777
3778 lhs += LHS_STEP_X;
3779
3780 a0 = VLOAD(M0)(0, lhs);
3781 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
3782
3783 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3784
3785 lhs += LHS_STEP_X;
3786
3787 a0 = VLOAD(M0)(0, lhs);
3788 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
3789
3790 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3791
3792 lhs += LHS_STEP_X;
3793
3794 a0 = VLOAD(M0)(0, lhs);
3795 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
3796
3797 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3798
3799 lhs += LHS_STEP_X;
3800
3801 a0 = VLOAD(M0)(0, lhs);
3802 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
3803
3804 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3805
3806 lhs += LHS_STEP_X;
3807
3808 a0 = VLOAD(M0)(0, lhs);
3809 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
3810
3811 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3812
3813 lhs += LHS_STEP_X;
3814
3815 a0 = VLOAD(M0)(0, lhs);
3816 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
3817
3818 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3819
3820 lhs += LHS_STEP_X;
3821#endif // K0 > 8
3822
3823#ifndef LHS_INTERLEAVE
3824 lhs += (M0 * K0 * (V0 - 1));
3825#endif // LHS_INTERLEAVE
3826
3827 x_rhs += K0 * RHS_STEP_X;
3828#ifndef RHS_INTERLEAVE
3829 x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
3830#endif // RHS_INTERLEAVE
3831 }
3832
3833 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3834
3835 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3836
3837#if defined(REINTERPRET_OUTPUT_AS_3D)
3838
3839 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
3840 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
3841 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3842 // multiply dst_stride_z by DEPTH_GEMM3D
3843 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3844
3845#else // defined(REINTERPRET_OUTPUT_AS_3D)
3846
3847 // Add offset for batched GEMM
3848 dst_addr += z * dst_stride_z;
3849
3850#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3851
3852 // Multiply by the weight of matrix-matrix product and store the result
3853#if defined(ALPHA)
3854 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3855#endif // defined(ALPHA)
3856
3857 // Add beta*bias
3858#if defined(BETA)
3859#if defined(BROADCAST_BIAS)
3860 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3861
3862 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3863
3864#ifndef UNIT_BETA
3865 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3866#endif // UNIT_BIAS
3867
3868 // c = c + bias[broadcasted]
3869#if defined(MIXED_PRECISION)
3870 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3871 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3872#else // defined(MIXED_PRECISION)
3873 ADD_BLOCK_BROADCAST(M0, c, bias0);
3874#endif // defined(MIXED_PRECISION)
3875
3876#else // defined(BROADCAST_BIAS)
3877 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
3878
3879 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3880
3881#ifndef UNIT_BETA
3882 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3883#endif // UNIT_BIAS
3884
3885#if defined(MIXED_PRECISION)
3886 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3887 ADD_BLOCK(M0, c, bias_hp);
3888#else // defined(MIXED_PRECISION)
3889 ADD_BLOCK(M0, c, bias);
3890#endif // defined(MIXED_PRECISION)
3891
3892#endif // defined(BROADCAST_BIAS)
3893#endif // defined(BETA)
3894
3895#if defined(ACTIVATION_TYPE)
3896#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003897 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003898#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003899 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003900#endif // defined(MIXED_PRECISION)
3901#endif // defined(ACTIVATION_TYPE)
3902
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003903 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
3904 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
3905
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003906 // Store output block
3907#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003908 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003909 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003910#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003911 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003912#endif // defined(MIXED_PRECISION)
3913
3914#undef LHS_BLOCK_SIZE
3915#undef LHS_OFFSET_X
3916#undef LHS_STEP_X
3917#undef RHS_BLOCK_SIZE
3918#undef RHS_OFFSET_X
3919#undef RHS_STEP_X
3920#undef PIXEL_UNIT
3921#undef LHS_STEP_LOOP
3922#undef RHS_STEP_LOOP
3923}
3924#endif // defined(OPENCL_IMAGE_SUPPORT)
3925
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003926#endif // defined(LHS_TRANSPOSE)
3927
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00003928#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
3929
giuros01b3204e72019-04-01 13:50:22 +01003930#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
3931
3932#define VFMA(a, b, c) \
3933 ({ \
3934 c = fma(a, b, c); \
3935 })
3936
3937#if M0 == 1
3938#define RHS_VFMA_M0xN0(i, a, b, c) \
3939 ({ \
3940 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3941 })
3942#elif M0 == 2 // M0 == 2
3943#define RHS_VFMA_M0xN0(i, a, b, c) \
3944 ({ \
3945 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3946 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3947 })
3948#elif M0 == 3 // M0 == 3
3949#define RHS_VFMA_M0xN0(i, a, b, c) \
3950 ({ \
3951 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3952 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3953 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3954 })
3955#elif M0 == 4 // M0 == 4
3956#define RHS_VFMA_M0xN0(i, a, b, c) \
3957 ({ \
3958 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3959 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3960 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3961 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3962 })
3963#elif M0 == 5 // M0 == 5
3964#define RHS_VFMA_M0xN0(i, a, b, c) \
3965 ({ \
3966 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3967 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3968 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3969 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3970 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3971 })
3972#elif M0 == 6 // M0 == 6
3973#define RHS_VFMA_M0xN0(i, a, b, c) \
3974 ({ \
3975 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3976 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3977 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3978 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3979 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3980 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
3981 })
3982#elif M0 == 7 // M0 == 7
3983#define RHS_VFMA_M0xN0(i, a, b, c) \
3984 ({ \
3985 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3986 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3987 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3988 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3989 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3990 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
3991 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
3992 })
3993#elif M0 == 8 // M0 == 8
3994#define RHS_VFMA_M0xN0(i, a, b, c) \
3995 ({ \
3996 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3997 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3998 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3999 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
4000 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
4001 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
4002 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
4003 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
4004 })
4005#else // M0 not supported
4006#error "M0 not supported"
4007#endif // M0 not supported
4008
4009/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
4010 * The LHS matrix is NOT reshaped
4011 * The RHS matrix is NOT reshaped
4012 *
4013 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004014 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)
4015 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
4016 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
4017 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
4018 * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
SiCong Li3a501662020-06-26 10:02:06 +01004019 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
4020 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
giuros01b3204e72019-04-01 13:50:22 +01004021 * @note Only the following configurations of M0, N0 and K0 are currently supported:
4022 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
4023 * - N0 = 2, 3, 4, 8, 16
4024 * - K0 = 2, 3, 4, 8, 16
4025 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004026 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004027 * The activation function is performed after the bias addition
giuros01b3204e72019-04-01 13:50:22 +01004028 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
4029 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
4030 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4031 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4032 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4033 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
4034 *
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004035 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
4036 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
4037 * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
4038 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
4039 * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
4040 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
4041 * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
4042 * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
4043 * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
4044 * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
4045 * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
4046 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004047 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
4048 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4049 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
4050 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4051 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
4052 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
4053 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
4054 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
4055 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
4056 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
4057 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
4058 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
4059 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
4060 * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
4061 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
4062 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
4063 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
4064 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
giuros01b3204e72019-04-01 13:50:22 +01004065 */
4066__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
4067 IMAGE_DECLARATION(rhs),
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004068#if defined(BETA)
4069 IMAGE_DECLARATION(bias),
4070#endif // defined(BETA)
giuros01b3204e72019-04-01 13:50:22 +01004071 IMAGE_DECLARATION(dst),
4072 uint lhs_stride_z,
4073 uint rhs_stride_z,
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004074#if defined(BETA)
4075 uint bias_stride_z,
4076#endif //defined(BETA)
giuros01b3204e72019-04-01 13:50:22 +01004077 uint dst_stride_z
4078#if defined(REINTERPRET_INPUT_AS_3D)
4079 ,
4080 uint lhs_cross_plane_pad
4081#endif // REINTERPRET_INPUT_AS_3D
4082#if defined(REINTERPRET_OUTPUT_AS_3D)
4083 ,
4084 uint dst_cross_plane_pad
4085#endif // REINTERPRET_OUTPUT_AS_3D
4086 )
4087{
4088 // Block size
4089#define RHS_BLOCK_SIZE ((K0) * (N0))
4090
4091 // RHS offset and step X
4092#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
4093
4094 uint x = get_global_id(0);
4095 uint y = get_global_id(1);
4096 uint z = get_global_id(2);
4097
4098#if defined(DUMMY_WORK_ITEMS)
4099 if((x * N0 >= N) || (y * M0 >= M))
4100 {
4101 return;
4102 }
4103#endif // defined(DUMMY_WORK_ITEMS)
4104
4105 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01004106 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
giuros01b3204e72019-04-01 13:50:22 +01004107
4108 // Compute RHS matrix address
4109 uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
4110
4111#if defined(MATRIX_B_DEPTH)
4112 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4113 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
4114#else // defined(MATRIX_B_DEPTH)
4115 rhs_offset += z * rhs_stride_z;
4116#endif // defined(MATRIX_B_DEPTH)
4117
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004118 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
4119 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
giuros01b3204e72019-04-01 13:50:22 +01004120
4121#if defined(REINTERPRET_INPUT_AS_3D)
4122 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
4123 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
4124
4125 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4126 // multiply lhs_stride_z by DEPTH_GEMM3D
4127 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
4128
4129#else // defined(REINTERPRET_INPUT_AS_3D)
4130
4131 // Add offset for batched GEMM
4132 lhs_offset += z * lhs_stride_z;
4133
4134#endif // defined(REINTERPRET_INPUT_AS_3D)
4135
4136 // Initialize the accumulators
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004137 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
giuros01b3204e72019-04-01 13:50:22 +01004138
4139 int i = 0;
4140 for(; i <= (K - K0); i += K0)
4141 {
4142 // Supported cases (M0, K0):
4143 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
4144 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
4145 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
4146 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
4147 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
4148 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
4149 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
4150 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
4151 // Load values from LHS matrix
4152 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
4153
4154 // Load values from RHS matrix
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004155 LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
giuros01b3204e72019-04-01 13:50:22 +01004156
4157 RHS_VFMA_M0xN0(0, a, b0, c);
4158 RHS_VFMA_M0xN0(1, a, b1, c);
4159#if K0 > 2
4160 RHS_VFMA_M0xN0(2, a, b2, c);
4161#endif // K0 > 2
4162#if K0 > 3
4163 RHS_VFMA_M0xN0(3, a, b3, c);
4164#endif // K0 > 3
4165#if K0 > 4
4166 RHS_VFMA_M0xN0(4, a, b4, c);
4167 RHS_VFMA_M0xN0(5, a, b5, c);
4168 RHS_VFMA_M0xN0(6, a, b6, c);
4169 RHS_VFMA_M0xN0(7, a, b7, c);
4170#endif // K0 > 4
4171#if K0 > 8
4172 RHS_VFMA_M0xN0(8, a, b8, c);
4173 RHS_VFMA_M0xN0(9, a, b9, c);
Gian Marco Iodice7b9d7ca2019-09-19 16:37:39 +01004174 RHS_VFMA_M0xN0(A, a, bA, c);
4175 RHS_VFMA_M0xN0(B, a, bB, c);
4176 RHS_VFMA_M0xN0(C, a, bC, c);
4177 RHS_VFMA_M0xN0(D, a, bD, c);
4178 RHS_VFMA_M0xN0(E, a, bE, c);
4179 RHS_VFMA_M0xN0(F, a, bF, c);
giuros01b3204e72019-04-01 13:50:22 +01004180#endif // K0 > 8
4181
4182 lhs_offset += K0 * sizeof(DATA_TYPE);
4183 rhs_offset += K0 * rhs_stride_y;
4184 }
4185
4186 // Left-over accumulations
4187 for(; i < K; ++i)
4188 {
4189 // Load values from LHS matrix
4190 VEC_DATA_TYPE(DATA_TYPE, 2)
4191 a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
4192#if M0 > 1
4193 VEC_DATA_TYPE(DATA_TYPE, 2)
4194 a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
4195#endif // M0 > 1
4196#if M0 > 2
4197 VEC_DATA_TYPE(DATA_TYPE, 2)
4198 a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
4199#endif // M0 > 2
4200#if M0 > 3
4201 VEC_DATA_TYPE(DATA_TYPE, 2)
4202 a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
4203#endif // M0 > 3
4204#if M0 > 4
4205 VEC_DATA_TYPE(DATA_TYPE, 2)
4206 a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
4207#endif // M0 > 4
4208#if M0 > 5
4209 VEC_DATA_TYPE(DATA_TYPE, 2)
4210 a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
4211#endif // M0 > 5
4212#if M0 > 6
4213 VEC_DATA_TYPE(DATA_TYPE, 2)
4214 a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
4215#endif // M0 > 6
4216#if M0 > 7
4217 VEC_DATA_TYPE(DATA_TYPE, 2)
4218 a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
4219#endif // M0 > 7
4220
4221 VEC_DATA_TYPE(DATA_TYPE, N0)
4222 b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
4223 RHS_VFMA_M0xN0(0, a, b, c);
4224
4225 lhs_offset += sizeof(DATA_TYPE);
4226 rhs_offset += rhs_stride_y;
4227 }
4228
SiCong Li406a13f2020-07-15 12:09:58 +01004229 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
giuros01b3204e72019-04-01 13:50:22 +01004230
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004231 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
giuros01b3204e72019-04-01 13:50:22 +01004232
4233#if defined(REINTERPRET_OUTPUT_AS_3D)
4234 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
4235 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
4236
4237 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4238 // multiply dst_stride_z by DEPTH_GEMM3D
4239 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
4240
4241#else // defined(REINTERPRET_OUTPUT_AS_3D)
4242
4243 // Add offset for batched GEMM
4244 dst_addr += z * dst_stride_z;
4245
4246#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4247
4248 // Multiply by the weight of matrix-matrix product and store the result
giuros01b3204e72019-04-01 13:50:22 +01004249#if defined(ALPHA)
4250 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
4251#endif // defined(ALPHA)
4252
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004253 // Add beta*bias
4254#if defined(BETA)
4255#if defined(BROADCAST_BIAS)
4256 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
4257
4258 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
4259
4260#ifndef UNIT_BETA
4261 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
4262#endif // UNIT_BIAS
4263
4264 // c = c + bias[broadcasted]
4265 ADD_BLOCK_BROADCAST(M0, c, bias0);
4266
4267#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01004268 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004269
4270 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
4271
4272#ifndef UNIT_BETA
4273 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
4274#endif // UNIT_BIAS
4275
4276 // c = c + bias
4277 ADD_BLOCK(M0, c, bias);
4278
4279#endif // defined(BROADCAST_BIAS)
4280#endif // defined(BETA)
4281
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004282#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01004283 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004284#endif // defined(ACTIVATION_TYPE)
4285
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01004286 const bool cond_y = y == 0;
4287 const bool cond_x = ((x + 1) * N0 >= N);
4288
giuros01b3204e72019-04-01 13:50:22 +01004289 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01004290 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
giuros01b3204e72019-04-01 13:50:22 +01004291
4292#undef RHS_BLOCK_SIZE
4293#undef RHS_OFFSET_X
4294#undef RHS_STEP_X
4295}
4296#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
4297
Gian Marco36a0a462018-01-12 10:21:40 +00004298#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004299/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00004300 *
Gian Marco19835e52018-01-30 13:35:54 +00004301 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004302 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
4303 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4304 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
4305 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004306 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004307 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
4308 * The activation function is performed after the bias addition
4309 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004310 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4311 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4312 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4313 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
4314 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004315 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
4316 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01004345__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
4346 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004347#if defined(BETA)
4348 IMAGE_DECLARATION(src2),
4349#endif // defined(BETA)
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01004350 IMAGE_DECLARATION(dst),
4351 uint src0_stride_z,
4352 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004353#if defined(BETA)
4354 uint src2_stride_z,
4355#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004356 uint dst_stride_z
4357#if defined(REINTERPRET_OUTPUT_AS_3D)
4358 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004359 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004360#endif // REINTERPRET_OUTPUT_AS_3D
4361 )
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004362{
Gian Marco36a0a462018-01-12 10:21:40 +00004363 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
4364 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marcoae2af742018-02-15 12:35:44 +00004365 int z = get_global_id(2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004366
Gian Marco36a0a462018-01-12 10:21:40 +00004367 // Offset
4368 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
4369 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004370
Gian Marco36a0a462018-01-12 10:21:40 +00004371 // src_addr_a = address of matrix A
4372 // src_addr_b = address of matrix B
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00004373 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
4374 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
4375
4376#if defined(MATRIX_B_DEPTH)
4377 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4378 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
4379#else // defined(MATRIX_B_DEPTH)
4380 src1_addr_in_bytes += z * src1_stride_z;
4381#endif // defined(MATRIX_B_DEPTH)
4382
4383 __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
4384 __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004385
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004386 // Compute end row address for matrix B
Gian Marco36a0a462018-01-12 10:21:40 +00004387 __global float *src_end_addr_b = src_addr_b + COLS_B;
4388
4389 src_addr_a += offset_row_a;
4390 src_addr_b += offset_row_b;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004391
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004392 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004393 float4 c0 = 0.0f;
4394 float4 c1 = 0.0f;
4395 float4 c2 = 0.0f;
4396 float4 c3 = 0.0f;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004397
Gian Marco36a0a462018-01-12 10:21:40 +00004398 for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004399 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004400 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004401 float4 a0 = vload4(0, src_addr_a);
4402 float4 b0 = vload4(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004403
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004404 c0 += (float4)a0.s0 * b0;
4405 c1 += (float4)a0.s1 * b0;
4406 c2 += (float4)a0.s2 * b0;
4407 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004408
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004409 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004410 a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
4411 b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004412
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004413 c0 += (float4)a0.s0 * b0;
4414 c1 += (float4)a0.s1 * b0;
4415 c2 += (float4)a0.s2 * b0;
4416 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004417 }
4418
Gian Marco36a0a462018-01-12 10:21:40 +00004419 for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004420 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004421 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004422 float4 a0 = vload4(0, src_addr_a);
4423 float4 b0 = vload4(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004424
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004425 c0 += (float4)a0.s0 * b0;
4426 c1 += (float4)a0.s1 * b0;
4427 c2 += (float4)a0.s2 * b0;
4428 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004429 }
4430
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004431 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004432 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
4433
Gian Marcoae2af742018-02-15 12:35:44 +00004434 // Compute dst address
4435 __global uchar *dst_addr = offset(&dst, 0, 0);
4436
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004437 uint4 zout = 0;
4438
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004439#if defined(REINTERPRET_OUTPUT_AS_3D)
4440 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004441 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004442 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004443 // | |
4444 // | plane0 |
4445 // | |
4446 // |__________________|
4447 // |******************|
4448 // | cross_plane_pad |
4449 // |******************|
4450 // | |
4451 // | plane1 |
4452 // | |
4453 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004454
4455 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004456 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
4457 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004458
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004459 // Add offset due to the cross plane paddings
4460 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004461
4462 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4463 // multiply dst_stride_z by DEPTH_GEMM3D
4464 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004465#else // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marcoae2af742018-02-15 12:35:44 +00004466 // Add offset for batched GEMM
4467 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004468#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4469
4470 // Multiply by the weight of matrix-matrix product and store the result
4471#if defined(ALPHA)
4472 SCALE_BLOCK(4, float, c, ALPHA);
4473#endif // defined(ALPHA)
4474
4475 // Add beta*bias
4476#if defined(BETA)
4477 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
4478
4479#if defined(BROADCAST_BIAS)
4480 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
4481
4482 LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4483
4484#ifndef UNIT_BETA
4485 SCALE_BLOCK(1, float, bias, BETA);
4486#endif // UNIT_BIAS
4487
4488 // c = c + bias[broadcasted]
4489 ADD_BLOCK_BROADCAST(4, c, bias0);
4490
4491#else // defined(BROADCAST_BIAS)
4492 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
4493 2) * src2_stride_z;
4494
4495 LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4496
4497#ifndef UNIT_BETA
4498 SCALE_BLOCK(4, float, bias, BETA);
4499#endif // UNIT_BIAS
4500
4501 // c = c + bias
4502 ADD_BLOCK(4, c, bias);
4503
4504#endif // defined(BROADCAST_BIAS)
4505#endif // defined(BETA)
4506
4507#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01004508 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004509#endif // defined(ACTIVATION_TYPE)
Gian Marcoae2af742018-02-15 12:35:44 +00004510
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004511 // Store 4x4 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004512 vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
4513 vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
4514 vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
4515 vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004516}
4517
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004518/** This OpenCL kernel is optimized for Bifrost and tt computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004519 *
Gian Marco19835e52018-01-30 13:35:54 +00004520 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004521 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
4522 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4523 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4524 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
4525 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004526 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004527 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
4528 * The activation function is performed after the bias addition
4529 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004530 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4531 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4532 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4533 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
4534 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004535 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
4536 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
4537 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4538 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
4539 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4540 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004541 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004542 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
4543 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4544 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
4545 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4546 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004547 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
4548 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4549 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
4550 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4551 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
4552 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004553 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004554 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004555 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004556 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004557 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004558 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004559 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
4560 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004561 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004562 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01004563 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004564 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01004565__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
4566 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004567#if defined(BETA)
4568 IMAGE_DECLARATION(src2),
4569#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00004570 IMAGE_DECLARATION(dst),
4571 uint src0_stride_z,
4572 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004573#if defined(BETA)
4574 uint src2_stride_z,
4575#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004576 uint dst_stride_z
4577#if defined(REINTERPRET_OUTPUT_AS_3D)
4578 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004579 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004580#endif // REINTERPRET_OUTPUT_AS_3D
4581 )
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004582{
Gian Marco36a0a462018-01-12 10:21:40 +00004583 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
4584 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marcoae2af742018-02-15 12:35:44 +00004585 int z = get_global_id(2);
Gian Marco36a0a462018-01-12 10:21:40 +00004586
4587 // Offset
4588 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
4589 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
4590
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004591 // src_addr_a = address of matrix A
4592 // src_addr_b = address of matrix B
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00004593 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
4594 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
4595
4596#if defined(MATRIX_B_DEPTH)
4597 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4598 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
4599#else // defined(MATRIX_B_DEPTH)
4600 src1_addr_in_bytes += z * src1_stride_z;
4601#endif // defined(MATRIX_B_DEPTH)
4602
4603 __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
4604 __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004605
Gian Marco36a0a462018-01-12 10:21:40 +00004606 src_addr_a += offset_row_a;
4607 src_addr_b += offset_row_b;
4608
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004609 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004610 float4 c0 = 0.0f;
4611 float4 c1 = 0.0f;
4612 float4 c2 = 0.0f;
4613 float4 c3 = 0.0f;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004614
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004615#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))
4616
4617 int i = 0;
4618 for(; i <= (int)(COLS_MTX_B - 4); i += 4)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004619 {
4620 // Load values from matrix A (interleaved) and matrix B (transposed)
4621 float4 a0 = vload4(0, src_addr_a);
4622 float4 b0 = vload4(0, src_addr_b);
4623
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004624 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4625 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004626
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004627 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4628 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4629 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4630 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004631
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004632 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4633 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4634 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4635 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004636
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004637 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4638 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4639 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4640 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004641
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004642 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4643 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4644 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4645 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004646
4647 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004648 a0 = vload4(0, src_addr_a);
4649 b0 = vload4(0, src_addr_b);
4650
4651 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4652 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004653
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004654 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4655 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4656 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4657 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004658
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004659 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4660 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4661 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4662 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004663
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004664 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4665 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4666 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4667 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004668
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004669 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4670 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4671 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4672 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004673
4674 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004675 a0 = vload4(0, src_addr_a);
4676 b0 = vload4(0, src_addr_b);
4677
4678 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4679 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
4680
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004681 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4682 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4683 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4684 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004685
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004686 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4687 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4688 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4689 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004690
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004691 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4692 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4693 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4694 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004695
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004696 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4697 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4698 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4699 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004700
4701 // Load values from matrix A (interleaved) and matrix B (transposed)
4702 a0 = vload4(0, src_addr_a);
4703 b0 = vload4(0, src_addr_b);
4704
4705 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4706 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004707
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004708 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4709 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4710 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4711 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004712
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004713 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4714 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4715 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4716 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004717
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004718 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4719 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4720 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4721 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004722
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004723 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4724 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4725 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4726 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004727 }
4728
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004729 for(; i < (int)(COLS_MTX_B); ++i)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004730 {
4731 // Load values from matrix A (interleaved) and matrix B (transposed)
4732 float4 a0 = vload4(0, src_addr_a);
4733 float4 b0 = vload4(0, src_addr_b);
4734
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004735 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4736 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
4737
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004738 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4739 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4740 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4741 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004742
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004743 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4744 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4745 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4746 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004747
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004748 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4749 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4750 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4751 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004752
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004753 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4754 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4755 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4756 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004757 }
4758
4759 // Compute destination address
4760 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
4761
Gian Marcoae2af742018-02-15 12:35:44 +00004762 // Compute dst address
4763 __global uchar *dst_addr = offset(&dst, 0, 0);
4764
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004765 uint4 zout = 0;
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00004766
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004767#if defined(REINTERPRET_OUTPUT_AS_3D)
4768 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004769 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004770 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004771 // | |
4772 // | plane0 |
4773 // | |
4774 // |__________________|
4775 // |******************|
4776 // | cross_plane_pad |
4777 // |******************|
4778 // | |
4779 // | plane1 |
4780 // | |
4781 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004782
4783 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004784 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
4785 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004786
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004787 // Add offset due to the cross plane paddings
4788 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004789
4790 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4791 // multiply dst_stride_z by DEPTH_GEMM3D
4792 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004793#else // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marcoae2af742018-02-15 12:35:44 +00004794 // Add offset for batched GEMM
4795 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004796#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4797
4798 // Multiply by the weight of matrix-matrix product and store the result
4799#if defined(ALPHA)
4800 SCALE_BLOCK(4, float, c, ALPHA);
4801#endif // defined(ALPHA)
4802
4803 // Add beta*bias
4804#if defined(BETA)
4805 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
4806
4807#if defined(BROADCAST_BIAS)
4808 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
4809
4810 LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4811
4812#ifndef UNIT_BETA
4813 SCALE_BLOCK(1, float, bias, BETA);
4814#endif // UNIT_BIAS
4815
4816 // c = c + bias[broadcasted]
4817 ADD_BLOCK_BROADCAST(4, c, bias0);
4818
4819#else // defined(BROADCAST_BIAS)
4820 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
4821 2) * src2_stride_z;
4822
4823 LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4824
4825#ifndef UNIT_BETA
4826 SCALE_BLOCK(4, float, bias, BETA);
4827#endif // UNIT_BIAS
4828
4829 // c = c + bias
4830 ADD_BLOCK(4, c, bias);
4831
4832#endif // defined(BROADCAST_BIAS)
4833#endif // defined(BETA)
4834
4835#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01004836 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004837#endif // defined(ACTIVATION_TYPE)
Gian Marcoae2af742018-02-15 12:35:44 +00004838
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004839 // Store 4x4 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004840 vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
4841 vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
4842 vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
4843 vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004844}
4845
Georgios Pinitas84225582018-05-14 12:00:05 +01004846// Undefine local defines
4847#undef COLS_MTX_B
4848
Matthew Bentham6f31f8c2017-10-27 11:50:06 +01004849#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004850/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00004851 *
Gian Marco19835e52018-01-30 13:35:54 +00004852 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004853 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
4854 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4855 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
4856 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004857 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004858 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
4859 * The activation function is performed after the bias addition
4860 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004861 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4862 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4863 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4864 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
4865 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004866 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
4867 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
4868 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4869 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
4870 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4871 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004872 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004873 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
4874 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4875 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
4876 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4877 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004878 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
4879 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4880 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
4881 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4882 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
4883 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004884 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004885 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004886 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004887 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004888 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004889 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004890 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
4891 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004892 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004893 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01004894 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004895 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01004896__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
4897 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004898#if defined(BETA)
4899 IMAGE_DECLARATION(src2),
4900#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00004901 IMAGE_DECLARATION(dst),
4902 uint src0_stride_z,
4903 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004904#if defined(BETA)
4905 uint src2_stride_z,
4906#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004907 uint dst_stride_z
4908#if defined(REINTERPRET_OUTPUT_AS_3D)
4909 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004910 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004911#endif // REINTERPRET_OUTPUT_AS_3D
4912 )
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004913{
Gian Marco36a0a462018-01-12 10:21:40 +00004914 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
4915 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marcoae2af742018-02-15 12:35:44 +00004916 int z = get_global_id(2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004917
Gian Marco36a0a462018-01-12 10:21:40 +00004918 // Offset
4919 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
4920 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004921
Gian Marco36a0a462018-01-12 10:21:40 +00004922 // src_addr_a = address of matrix A
4923 // src_addr_b = address of matrix B
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00004924 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
4925 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
4926
4927#if defined(MATRIX_B_DEPTH)
4928 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4929 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
4930#else // defined(MATRIX_B_DEPTH)
4931 src1_addr_in_bytes += z * src1_stride_z;
4932#endif // defined(MATRIX_B_DEPTH)
4933
4934 __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
4935 __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004936
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004937 // Compute end row address for matrix B
Gian Marco36a0a462018-01-12 10:21:40 +00004938 __global half *src_end_addr_b = src_addr_b + COLS_B;
4939
4940 src_addr_a += offset_row_a;
4941 src_addr_b += offset_row_b;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004942
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004943 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004944 half8 c0 = 0.0f;
4945 half8 c1 = 0.0f;
4946 half8 c2 = 0.0f;
4947 half8 c3 = 0.0f;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004948
Gian Marco36a0a462018-01-12 10:21:40 +00004949 for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004950 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004951 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004952 half4 a0 = vload4(0, src_addr_a);
4953 half8 b0 = vload8(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004954
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004955 c0 += (half8)a0.s0 * b0;
4956 c1 += (half8)a0.s1 * b0;
4957 c2 += (half8)a0.s2 * b0;
4958 c3 += (half8)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004959
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004960 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004961 a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
4962 b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004963
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004964 c0 += (half8)a0.s0 * b0;
4965 c1 += (half8)a0.s1 * b0;
4966 c2 += (half8)a0.s2 * b0;
4967 c3 += (half8)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004968 }
4969
Gian Marco36a0a462018-01-12 10:21:40 +00004970 for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004971 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004972 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004973 half4 a0 = vload4(0, src_addr_a);
4974 half8 b0 = vload8(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004975
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004976 c0 += (half8)a0.s0 * b0;
4977 c1 += (half8)a0.s1 * b0;
4978 c2 += (half8)a0.s2 * b0;
4979 c3 += (half8)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004980 }
4981
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004982 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004983 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
4984
Gian Marcoae2af742018-02-15 12:35:44 +00004985 // Compute dst address
4986 __global uchar *dst_addr = offset(&dst, 0, 0);
4987
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004988 uint4 zout = 0;
4989
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004990#if defined(REINTERPRET_OUTPUT_AS_3D)
4991 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004992 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004993 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004994 // | |
4995 // | plane0 |
4996 // | |
4997 // |__________________|
4998 // |******************|
4999 // | cross_plane_pad |
5000 // |******************|
5001 // | |
5002 // | plane1 |
5003 // | |
5004 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005005
5006 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005007 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
5008 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005009
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005010 // Add offset due to the cross plane paddings
5011 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005012
5013 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
5014 // multiply dst_stride_z by DEPTH_GEMM3D
5015 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005016#else // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marcoae2af742018-02-15 12:35:44 +00005017 // Add offset for batched GEMM
5018 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005019#endif // defined(REINTERPRET_OUTPUT_AS_3D)
5020
5021 // Multiply by the weight of matrix-matrix product and store the result
5022#if defined(ALPHA)
5023 SCALE_BLOCK(4, half, c, ALPHA);
5024#endif // defined(ALPHA)
5025
5026 // Add beta*bias
5027#if defined(BETA)
5028 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
5029
5030#if defined(BROADCAST_BIAS)
5031 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
5032
5033 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5034
5035#ifndef UNIT_BETA
5036 SCALE_BLOCK(1, half, bias, BETA);
5037#endif // UNIT_BIAS
5038
5039 // c = c + bias[broadcasted]
5040 ADD_BLOCK_BROADCAST(4, c, bias0);
5041
5042#else // defined(BROADCAST_BIAS)
5043
5044 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
5045 2) * src2_stride_z;
5046
5047 LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5048
5049#ifndef UNIT_BETA
5050 SCALE_BLOCK(4, half, bias, BETA);
5051#endif // UNIT_BIAS
5052
5053 // c = c + bias
5054 ADD_BLOCK(4, c, bias);
5055
5056#endif // defined(BROADCAST_BIAS)
5057#endif // defined(BETA)
5058
5059#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01005060 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005061#endif // defined(ACTIVATION_TYPE)
Gian Marcoae2af742018-02-15 12:35:44 +00005062
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005063 // Store 4x8 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005064 vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
5065 vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
5066 vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
5067 vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01005068}
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005069
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005070/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32 floating point variable.
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00005071 *
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005072 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005073 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
5074 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
5075 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
5076 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005077 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005078 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
5079 * The activation function is performed after the bias addition
5080 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005081 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
5082 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
5083 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
5084 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
5085 *
5086 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
5087 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
5088 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5089 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
5090 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5091 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
5092 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
5093 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
5094 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5095 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
5096 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5097 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005098 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
5099 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
5100 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
5101 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
5102 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
5103 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005104 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
5105 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
5106 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
5107 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
5108 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
5109 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
5110 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
5111 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005112 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005113 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
5114 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
5115 */
5116__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
5117 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005118#if defined(BETA)
5119 IMAGE_DECLARATION(src2),
5120#endif // defined(BETA)
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005121 IMAGE_DECLARATION(dst),
5122 uint src0_stride_z,
5123 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005124#if defined(BETA)
5125 uint src2_stride_z,
5126#endif //defined(BETA)
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005127 uint dst_stride_z
5128#if defined(REINTERPRET_OUTPUT_AS_3D)
5129 ,
5130 uint cross_plane_pad
5131#endif // REINTERPRET_OUTPUT_AS_3D
5132 )
5133{
5134 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
5135 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
5136 int z = get_global_id(2);
5137
5138 // Offset
5139 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
5140 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
5141
5142 // src_addr_a = address of matrix A
5143 // src_addr_b = address of matrix B
5144 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
5145 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
5146
5147#if defined(MATRIX_B_DEPTH)
5148 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
5149 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
5150#else // defined(MATRIX_B_DEPTH)
5151 src1_addr_in_bytes += z * src1_stride_z;
5152#endif // defined(MATRIX_B_DEPTH)
5153
5154 __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
5155 __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
5156
5157 // Compute end row address for matrix B
5158 __global half *src_end_addr_b = src_addr_b + COLS_B;
5159
5160 src_addr_a += offset_row_a;
5161 src_addr_b += offset_row_b;
5162
5163 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005164 float8 c0 = 0.0f;
5165 float8 c1 = 0.0f;
5166 float8 c2 = 0.0f;
5167 float8 c3 = 0.0f;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005168
5169 for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
5170 {
5171 // Load values from matrix A (interleaved) and matrix B (transposed)
5172 float4 a0 = convert_float4(vload4(0, src_addr_a));
5173 float8 b0 = convert_float8(vload8(0, src_addr_b));
5174
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005175 c0 += (float8)a0.s0 * b0;
5176 c1 += (float8)a0.s1 * b0;
5177 c2 += (float8)a0.s2 * b0;
5178 c3 += (float8)a0.s3 * b0;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005179
5180 // Load values from matrix A (interleaved) and matrix B (transposed)
5181 a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
5182 b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
5183
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005184 c0 += (float8)a0.s0 * b0;
5185 c1 += (float8)a0.s1 * b0;
5186 c2 += (float8)a0.s2 * b0;
5187 c3 += (float8)a0.s3 * b0;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005188 }
5189
5190 for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
5191 {
5192 // Load values from matrix A (interleaved) and matrix B (transposed)
5193 float4 a0 = convert_float4(vload4(0, src_addr_a));
5194 float8 b0 = convert_float8(vload8(0, src_addr_b));
5195
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005196 c0 += (float8)a0.s0 * b0;
5197 c1 += (float8)a0.s1 * b0;
5198 c2 += (float8)a0.s2 * b0;
5199 c3 += (float8)a0.s3 * b0;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005200 }
5201
5202 // Compute destination address
5203 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
5204
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005205 // Compute dst address
5206 __global uchar *dst_addr = offset(&dst, 0, 0);
5207
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005208 uint4 zout = 0;
5209
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005210#if defined(REINTERPRET_OUTPUT_AS_3D)
5211 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
5212 // in order to take into account the presence of possible cross plane paddings
5213 //
5214 // | |
5215 // | plane0 |
5216 // | |
5217 // |__________________|
5218 // |******************|
5219 // | cross_plane_pad |
5220 // |******************|
5221 // | |
5222 // | plane1 |
5223 // | |
5224 // |__________________|
5225
5226 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005227 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
5228 zout = min(DEPTH_GEMM3D - 1, zout);
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005229
5230 // Add offset due to the cross plane paddings
5231 zout *= (cross_plane_pad * dst_stride_y);
5232
5233 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
5234 // multiply dst_stride_z by DEPTH_GEMM3D
5235 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005236#else // defined(REINTERPRET_OUTPUT_AS_3D)
5237 // Add offset for batched GEMM
5238 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005239#endif // defined(REINTERPRET_OUTPUT_AS_3D)
5240
5241 // Multiply by the weight of matrix-matrix product and store the result
5242#if defined(ALPHA)
5243 SCALE_BLOCK(4, float, c, ALPHA);
5244#endif // defined(ALPHA)
5245
5246#if defined(BETA)
5247 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
5248
5249#if defined(BROADCAST_BIAS)
5250 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
5251
5252 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5253
5254 float8 bias_f0 = convert_float8(bias0);
5255
5256#ifndef UNIT_BETA
5257 SCALE_BLOCK(1, float, bias_f, BETA);
5258#endif // UNIT_BIAS
5259
5260 // c = c + bias[broadcasted]
5261 ADD_BLOCK_BROADCAST(4, c, bias_f0);
5262
5263#else // defined(BROADCAST_BIAS)
5264 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
5265 2) * src2_stride_z;
5266
5267 LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5268
5269 float8 bias_f0 = convert_float8(bias0);
5270 float8 bias_f1 = convert_float8(bias1);
5271 float8 bias_f2 = convert_float8(bias2);
5272 float8 bias_f3 = convert_float8(bias3);
5273
5274#ifndef UNIT_BETA
5275 SCALE_BLOCK(4, float, bias_f, BETA);
5276#endif // UNIT_BIAS
5277
5278 // c = c + bias
5279 ADD_BLOCK(4, c, bias_f);
5280
5281#endif // defined(BROADCAST_BIAS)
5282#endif // defined(BETA)
5283
5284 half8 c_h0 = convert_half8(c0);
5285 half8 c_h1 = convert_half8(c1);
5286 half8 c_h2 = convert_half8(c2);
5287 half8 c_h3 = convert_half8(c3);
5288
5289#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01005290 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c_h, A_VAL, B_VAL);
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005291#endif // defined(ACTIVATION_TYPE)
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005292
5293 // Store 4x8 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005294 vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
5295 vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
5296 vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
5297 vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005298}
5299
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005300/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00005301 *
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005302 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005303 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
5304 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
5305 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
5306 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005307 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005308 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
5309 * The activation function is performed after the bias addition
5310 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005311 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
5312 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
5313 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
5314 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
5315 *
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005316 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
5317 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
5318 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5319 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
5320 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5321 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
5322 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
5323 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
5324 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5325 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
5326 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5327 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
5329 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
5330 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
5331 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
5332 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
5333 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005334 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
5335 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
5336 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
5337 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
5338 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
5339 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005340 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
5341 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
5342 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005344 */
__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                         IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                         IMAGE_DECLARATION(dst),
                                                         uint src0_stride_z,
                                                         uint src1_stride_z,
#if defined(BETA)
                                                         uint src2_stride_z,
#endif // defined(BETA)
                                                         uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                         ,
                                                         uint cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                                         )
{
    // Each work-item accumulates a 4x8 block of half values (c0..c3, one half8 per output row).
    // The global ids are split into the index of the reshaped row (x, y) and the position of
    // this work-item inside that row (offset_row_b, offset_row_a).
    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    int z = get_global_id(2);

    // Offset (in elements) of this work-item inside the reshaped rows of A and B
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src1_addr_in_bytes += z * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    half8 c0 = 0.0f;
    half8 c1 = 0.0f;
    half8 c2 = 0.0f;
    half8 c3 = 0.0f;

    // Number of accumulation steps. The macro is #undef'd after the kernel, so it must stay defined here.
#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

    // Main loop: four accumulation steps per iteration
    int i = 0;
    for(; i <= (int)(COLS_MTX_B - 4); i += 4)
    {
#if MULT_INTERLEAVE4X4_HEIGHT == 1
        // With MULT_INTERLEAVE4X4_HEIGHT == 1 the A values of two consecutive steps are contiguous,
        // so a single half8 load serves two steps (s0-s3 for the first, s4-s7 for the second).

        // Load values from matrix A (interleaved) and matrix B (transposed)
        half8 a0 = vload8(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix B (transposed)
        b0 = vload8(0, src_addr_b);

        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s4, b0, c0);
        c1 = fma((half8)a0.s5, b0, c1);
        c2 = fma((half8)a0.s6, b0, c2);
        c3 = fma((half8)a0.s7, b0, c3);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload8(0, src_addr_a);
        b0 = vload8(0, src_addr_b);

        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix B (transposed)
        b0 = vload8(0, src_addr_b);

        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s4, b0, c0);
        c1 = fma((half8)a0.s5, b0, c1);
        c2 = fma((half8)a0.s6, b0, c2);
        c3 = fma((half8)a0.s7, b0, c3);
#else  // MULT_INTERLEAVE4X4_HEIGHT == 1
        // Load values from matrix A (interleaved) and matrix B (transposed)
        half4 a0 = vload4(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
    }

    // Left-over loop: one accumulation step per iteration
    for(; i < (int)(COLS_MTX_B); ++i)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        half4 a0 = vload4(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset; stays 0 unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(4, half, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // Only the X position selects the bias values: a single 1x8 bias row is loaded
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, half, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(4, c, bias0);

#else  // defined(BROADCAST_BIAS)
    // A 4x8 bias block is addressed with the same X/Y/Z position as the output block
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
                                    2) * src2_stride_z;

    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(4, half, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(4, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // NOTE(review): VEC_SIZE is not defined in this part of the file - presumably supplied at compile time; confirm.
    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store 4x8 block
    vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
    vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
    vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
    vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
}
Georgios Pinitas84225582018-05-14 12:00:05 +01005599
5600// Undefine local defines
5601#undef COLS_MTX_B
5602
Matthew Bentham6f31f8c2017-10-27 11:50:06 +01005603#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01005604
Gian Marco36a0a462018-01-12 10:21:40 +00005605#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01005606
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01005607#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
5608#if defined(DATA_TYPE)
5609#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * @note This OpenCL kernel works with floating point data types (F16/F32)
 * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
 * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note The bias (src2) is only read when BETA is defined at compile time. If BROADCAST_BIAS is defined, a single bias row is loaded and added to every output row,
 *       otherwise a bias block with the same position and size as the output block is loaded. The bias is multiplied by BETA unless UNIT_BETA is defined.
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          The plane of a row is computed as (row index / HEIGHT_GEMM3D), clamped to (DEPTH_GEMM3D - 1)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
                                     IMAGE_DECLARATION(src1),
#if defined(BETA)
                                     IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                     IMAGE_DECLARATION(dst),
                                     uint src0_stride_z,
                                     uint src1_stride_z,
#if defined(BETA)
                                     uint src2_stride_z,
#endif // defined(BETA)
                                     uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                     ,
                                     uint src_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                     ,
                                     uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                     )
{
    // First output column handled by this work-item
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B
    // (src_addr.s0 = byte offset into matrix A, src_addr.s1 = byte offset into matrix B)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx * sizeof(DATA_TYPE);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zin       = min(DEPTH_GEMM3D - 1, zin);

    // Add offset due to the cross plane paddings
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Byte offset one past the last element of the current row of matrix A
    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

    // One accumulator vector per output row
    VECTOR_TYPE acc0 = 0.0f;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_TYPE acc1 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_TYPE acc2 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_TYPE acc3 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consume two columns of A (and two rows of B) per iteration
    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
        VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

        // Accumulate
        acc0 += b0 * (VECTOR_TYPE)a0.s0;
        acc0 += b1 * (VECTOR_TYPE)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (VECTOR_TYPE)a1.s0;
        acc1 += b1 * (VECTOR_TYPE)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (VECTOR_TYPE)a2.s0;
        acc2 += b1 * (VECTOR_TYPE)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (VECTOR_TYPE)a3.s0;
        acc3 += b1 * (VECTOR_TYPE)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Left-over loop: consume one column of A (and one row of B) per iteration
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

        // Accumulate
        acc0 += b0 * (VECTOR_TYPE)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (VECTOR_TYPE)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (VECTOR_TYPE)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (VECTOR_TYPE)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset; stays 0 unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // Only the X position selects the bias values: a single bias row is loaded
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else  // defined(BROADCAST_BIAS)
    // The bias block is addressed with the same X/Y/Z position as the output block
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // NOTE(review): VEC_SIZE is not defined in this part of the file - presumably supplied at compile time; confirm.
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);
}
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01005920#endif // defined(DATA_TYPE)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01005921
Michele Di Giorgiof6f08da2018-04-26 10:24:30 +01005922/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005923 *
5924 * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
5925 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
5926 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
5927 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
5928 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005929 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
5930 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005931 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005932 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
5933 * The activation function is performed after the bias addition
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005934 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
5935 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005936 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
5937 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
5938 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
5939 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows of matrix A NOT reshaped
5940 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005941 * @param[in] src0_ptr Pointer to the source matrix A. Supported data types: F32
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005942 * @param[in] src0_stride_x Stride of the source matrix A in X dimension (in bytes)
5943 * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)
5944 * @param[in] src0_stride_y Stride of the source matrix A in Y dimension (in bytes)
5945 * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)
5946 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix A
5947 * @param[in] src1_ptr Pointer to the source matrix B. Supported data types: same as @p src0_ptr
5948 * @param[in] src1_stride_x Stride of the source matrix B in X dimension (in bytes)
5949 * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)
5950 * @param[in] src1_stride_y Stride of the source matrix B in Y dimension (in bytes)
5951 * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)
5952 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix B
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005953 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
5954 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
5955 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
5956 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
5957 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
5958 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005959 * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
5960 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
5961 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
5962 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
5963 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
5964 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005965 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
5966 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005967 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005968 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005969 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
5970 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005971 */
5972__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
5973 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005974#if defined(BETA)
5975 IMAGE_DECLARATION(src2),
5976#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00005977 IMAGE_DECLARATION(dst),
5978 uint src0_stride_z,
5979 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005980#if defined(BETA)
5981 uint src2_stride_z,
5982#endif // defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005983 uint dst_stride_z
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005984#if defined(REINTERPRET_INPUT_AS_3D)
5985 ,
5986 uint src_cross_plane_pad
5987#endif // defined(REINTERPRET_INPUT_AS_3D)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005988#if defined(REINTERPRET_OUTPUT_AS_3D)
5989 ,
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005990 uint dst_cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005991#endif // defined(REINTERPRET_OUTPUT_AS_3D)
5992 )
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005993{
    // Each work-item computes a tile of NUM_ELEMS_PROCESSED_PER_THREAD_Y rows x 4 columns of the output:
    // one float4 accumulator per row (acc0..acc3), written with vstore4 at the end of the kernel.
    //
    // NOTE(review): loads from matrix B, the accumulators, the bias offset and the stores are all hard-coded
    // to 4 floats, whereas idx below scales by NUM_ELEMS_PROCESSED_PER_THREAD_X. The two only agree when
    // NUM_ELEMS_PROCESSED_PER_THREAD_X == 4 - confirm every caller builds this kernel with that value.

    // Index of the first column of matrix B read by this work-item
5994 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
5995
5996 // Compute the starting byte offsets: src_addr.s0 indexes src0_ptr (matrix A), src_addr.s1 indexes src1_ptr (matrix B)
5997 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
5998
5999 // Update address for matrix A: move to the first of the NUM_ELEMS_PROCESSED_PER_THREAD_Y rows owned by this work-item
6000 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
6001
6002 // Update address for matrix B: move to column idx
6003 src_addr.s1 += idx * sizeof(float);
6004
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006005#if defined(REINTERPRET_INPUT_AS_3D)
6006 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
6007 // in order to take into account the presence of possible cross plane paddings
6008 //
6009 // | |
6010 // | plane0 |
6011 // | |
6012 // |__________________|
6013 // |******************|
6014 // | cross_plane_pad |
6015 // |******************|
6016 // | |
6017 // | plane1 |
6018 // | |
6019 // |__________________|
6020
6021 // The plane (zin) of each of the (up to 4) rows is calculated dividing its row index M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y + 0..3) by HEIGHT_GEMM3D
6022 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp the plane index to the last plane (DEPTH_GEMM3D - 1)
6023 zin = min(DEPTH_GEMM3D - 1, zin);
6024
6025 // Convert the plane index into a byte offset: every plane crossed adds src_cross_plane_pad rows of src0_stride_y bytes
6026 zin *= (src_cross_plane_pad * src0_stride_y);
6027
6028 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6029 // multiply src0_stride_z by DEPTH_GEMM3D
6030 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
6031
6032#else // defined(REINTERPRET_INPUT_AS_3D)
6033
Gian Marcoae2af742018-02-15 12:35:44 +00006034 // Add offset for batched GEMM
6035 src_addr.s0 += get_global_id(2) * src0_stride_z;
6036
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006037#endif // defined(REINTERPRET_INPUT_AS_3D)
6038
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006039#if defined(MATRIX_B_DEPTH)
6040 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3: the batch index wraps modulo MATRIX_B_DEPTH
6041 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6042#else // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006043 src_addr.s1 += get_global_id(2) * src1_stride_z;
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006044#endif // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006045
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006046 // Initialize accumulators (one float4 per output row handled by this work-item)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006047 float4 acc0 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006048
6049#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006050 float4 acc1 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006051#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6052
6053#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006054 float4 acc2 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006055#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6056
6057#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006058 float4 acc3 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006059#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6060
6061 // A and B src indices get incremented at the same time.
    // Main loop: each iteration consumes 4 columns of matrix A and the 4 matching rows of matrix B.
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006062 int i = 0;
6063 for(; i <= ((int)COLS_A - 4); i += 4)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006064 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006065#if defined(REINTERPRET_INPUT_AS_3D)
6066 // Load 4 values per row of matrix A into a0..a3 (LOAD_BLOCK receives the per-row cross-plane offsets in zin)
Usama Arif0681e3b2019-04-25 14:28:07 +01006067 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
6068#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006069 // Load 4 consecutive values from each row of matrix A
6070 float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006071#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006072 float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006073#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6074#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006075 float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006076#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6077#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006078 float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006079#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006080#endif // defined(REINTERPRET_INPUT_AS_3D)
6081
    // Load one row (4 values) of matrix B, then advance to the next row of B
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006082 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6083 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006084
6085 // Multiply and accumulate: first of the 4 loaded columns of A (aN.s0) times the row of B just loaded
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006086 acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
6087 acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
6088 acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
6089 acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006090
6091#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006092
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006093 acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
6094 acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
6095 acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
6096 acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006097
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006098#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6099#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006100
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006101 acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
6102 acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
6103 acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
6104 acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006105
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006106#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6107#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006108
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006109 acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
6110 acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
6111 acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
6112 acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006113#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006114
6115 // Load the next row (4 values) of matrix B, then advance to the following row
6116 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6117 src_addr.s1 += src1_stride_y;
6118
6119 // Multiply and accumulate: second loaded column of A (aN.s1) times the row of B just loaded
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006120 acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
6121 acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
6122 acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
6123 acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006124
6125#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6126
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006127 acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
6128 acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
6129 acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
6130 acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006131
6132#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6133#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6134
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006135 acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
6136 acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
6137 acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
6138 acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006139
6140#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6141#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6142
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006143 acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
6144 acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
6145 acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
6146 acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006147#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6148
6149 // Load the next row (4 values) of matrix B, then advance to the following row
6150 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6151 src_addr.s1 += src1_stride_y;
6152
6153 // Multiply and accumulate: third loaded column of A (aN.s2) times the row of B just loaded
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006154 acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
6155 acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
6156 acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
6157 acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006158
6159#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6160
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006161 acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
6162 acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
6163 acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
6164 acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006165
6166#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6167#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6168
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006169 acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
6170 acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
6171 acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
6172 acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006173
6174#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6175#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6176
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006177 acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
6178 acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
6179 acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
6180 acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006181#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6182
6183 // Load the next row (4 values) of matrix B, then advance to the following row
6184 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6185 src_addr.s1 += src1_stride_y;
6186
6187 // Multiply and accumulate: fourth loaded column of A (aN.s3) times the row of B just loaded
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006188 acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
6189 acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
6190 acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
6191 acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006192
6193#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6194
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006195 acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
6196 acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
6197 acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
6198 acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006199
6200#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6201#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6202
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006203 acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
6204 acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
6205 acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
6206 acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006207
6208#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6209#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6210
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006211 acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
6212 acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
6213 acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
6214 acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006215#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6216
    // Advance matrix A by the 4 columns consumed in this iteration
6217 src_addr.s0 += 4 * sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006218 }
6219
    // Leftover loop: process the remaining columns of matrix A (fewer than 4) one at a time
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006220 for(; i < (int)COLS_A; ++i)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006221 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006222#if defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006223 // Load one value per row of matrix A, adding the per-row cross-plane offset (zin.sN)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006224 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
6225#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6226 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6227#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6228#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6229 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6230#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6231#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6232 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6233#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6234#else // defined(REINTERPRET_INPUT_AS_3D)
6235 // Load one value per row of matrix A
6236 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006237#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6238 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6239#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6240#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6241 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6242#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6243#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6244 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6245#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006246#endif // defined(REINTERPRET_INPUT_AS_3D)
6247
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006248 // Load one row (4 values) of matrix B, then advance to the next row of B
6249 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006250 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006251
6252 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006253 acc0.s0 = fma(a0, b0.s0, acc0.s0);
6254 acc0.s1 = fma(a0, b0.s1, acc0.s1);
6255 acc0.s2 = fma(a0, b0.s2, acc0.s2);
6256 acc0.s3 = fma(a0, b0.s3, acc0.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006257#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006258 acc1.s0 = fma(a1, b0.s0, acc1.s0);
6259 acc1.s1 = fma(a1, b0.s1, acc1.s1);
6260 acc1.s2 = fma(a1, b0.s2, acc1.s2);
6261 acc1.s3 = fma(a1, b0.s3, acc1.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006262#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6263#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006264 acc2.s0 = fma(a2, b0.s0, acc2.s0);
6265 acc2.s1 = fma(a2, b0.s1, acc2.s1);
6266 acc2.s2 = fma(a2, b0.s2, acc2.s2);
6267 acc2.s3 = fma(a2, b0.s3, acc2.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006268#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6269#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006270 acc3.s0 = fma(a3, b0.s0, acc3.s0);
6271 acc3.s1 = fma(a3, b0.s1, acc3.s1);
6272 acc3.s2 = fma(a3, b0.s2, acc3.s2);
6273 acc3.s3 = fma(a3, b0.s3, acc3.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006274#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006275
    // Advance matrix A by the single column consumed in this iteration
6276 src_addr.s0 += sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006277 }
6278
    // Batch index (third dimension of the NDRange)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006279 int z = get_global_id(2);
6280
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006281 // Build the destination image descriptor
6282 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
6283
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006284 // Compute dst address
6285 __global uchar *dst_addr = offset(&dst, 0, 0);
6286
    // Per-row byte offsets added at store time; they stay 0 unless REINTERPRET_OUTPUT_AS_3D is defined
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006287 uint4 zout = 0;
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00006288
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006289#if defined(REINTERPRET_OUTPUT_AS_3D)
6290 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006291 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006292 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006293 // | |
6294 // | plane0 |
6295 // | |
6296 // |__________________|
6297 // |******************|
6298 // | cross_plane_pad |
6299 // |******************|
6300 // | |
6301 // | plane1 |
6302 // | |
6303 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006304
6305 // The plane (zout) of each of the (up to 4) rows is calculated dividing its row index M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y + 0..3) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006306 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp the plane index to the last plane (DEPTH_GEMM3D - 1)
6307 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006308
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006309 // Convert the plane index into a byte offset: every plane crossed adds dst_cross_plane_pad rows of dst_stride_y bytes
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006310 zout *= (dst_cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006311
6312 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6313 // multiply dst_stride_z by DEPTH_GEMM3D
6314 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006315#else // defined(REINTERPRET_OUTPUT_AS_3D)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006316 // Add offset for batched GEMM
6317 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006318#endif // defined(REINTERPRET_OUTPUT_AS_3D)
6319
6320 // Multiply the accumulators by ALPHA, the weight of the matrix-matrix product
6321#if defined(ALPHA)
6322 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
6323#endif // defined(ALPHA)
6324
6325 // Add beta*bias
6326#if defined(BETA)
    // Zero offsets handed to LOAD_BLOCK as its last argument (the same slot receives zin in the matrix A load above)
6327 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
6328
6329#if defined(BROADCAST_BIAS)
    // Broadcast case: a single bias row of 4 values, selected by the x work-item index only
6330 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
6331
6332 LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
6333
    // The scaling by BETA is skipped when UNIT_BETA is defined
6334#ifndef UNIT_BETA
6335 SCALE_BLOCK(1, float, bias, BETA);
6336#endif // UNIT_BETA
6337
6338 // acc = acc + bias[broadcasted]
6339 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
6340
6341#else // defined(BROADCAST_BIAS)
    // Non-broadcast case: the bias address is offset by the x, y and z work-item indices, like the output tile
6342 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *
6343 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
6344
6345 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
6346
    // The scaling by BETA is skipped when UNIT_BETA is defined
6347#ifndef UNIT_BETA
6348 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
6349#endif // UNIT_BETA
6350
6351 // acc = acc + bias
6352 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
6353
6354#endif // defined(BROADCAST_BIAS)
6355#endif // defined(BETA)
6356
6357#if defined(ACTIVATION_TYPE)
    // NOTE(review): VEC_SIZE is not set anywhere in this kernel, while the accumulators are always float4 -
    // confirm that it is defined as 4 wherever this kernel is built with an activation.
Giorgio Arenad056e572020-10-12 11:53:51 +01006358 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL);
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006359#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006360
6361 // Store the output block: one float4 per row, each at its own row stride plus cross-plane offset (zout.sN)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006362 vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006363#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006364 vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006365#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6366#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006367 vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006368#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6369#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006370 vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006371#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006372}
6373
6374/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
6375 *
6376 * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
6377 * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.
6378 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
6379 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
6380 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
6381 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006382 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
6383 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006384 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006385 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
6386 * The activation function is performed after the bias addition
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006387 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
6388 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006389 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
6390 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
6391 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
6392 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
6393 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006394 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006395 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
6396 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
6397 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
6398 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
6399 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
6400 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
6401 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
6402 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
6403 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
6404 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
6405 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006406 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
6407 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
6408 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
6409 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
6410 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
6411 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006412 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
6413 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
6414 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
6415 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
6416 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
6417 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006418 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
6419 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006420 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006421 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006422 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
6423 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006424 */
6425__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
6426 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006427#if defined(BETA)
6428 IMAGE_DECLARATION(src2),
6429#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00006430 IMAGE_DECLARATION(dst),
6431 uint src0_stride_z,
6432 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006433#if defined(BETA)
6434 uint src2_stride_z,
6435#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006436 uint dst_stride_z
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006437#if defined(REINTERPRET_INPUT_AS_3D)
6438 ,
6439 uint src_cross_plane_pad
6440#endif // REINTERPRET_INPUT_AS_3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006441#if defined(REINTERPRET_OUTPUT_AS_3D)
6442 ,
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006443 uint dst_cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006444#endif // REINTERPRET_OUTPUT_AS_3D
6445 )
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006446{
6447 // Each work-item accumulates a tile of up to 4 rows (one float2 accumulator per row, enabled by NUM_ELEMS_PROCESSED_PER_THREAD_Y) x 2 columns.
// Matrix A is read with vload8 in the unrolled loop and one scalar per row in the leftover loop; matrix B is read with vload2.
// The bias address computed further down hard-codes 2 columns per work-item, so NUM_ELEMS_PROCESSED_PER_THREAD_X is expected to be 2.
// idx is the first column of matrix B handled by this work-item (in elements).
6448 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
6449
6450 // Compute starting address for matrix A and Matrix B (.s0 = byte offset into src0, .s1 = byte offset into src1)
6451 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
6452
6453 // Update address for the matrix A: move to the first row of this work-item's tile
6454 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
6455
6456 // Update address for the matrix B: move to the first column of this work-item's tile
6457 src_addr.s1 += idx * sizeof(float);
6458
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006459#if defined(REINTERPRET_INPUT_AS_3D)
6460 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
6461 // in order to take into account the presence of possible cross plane paddings
6462 //
6463 // | |
6464 // | plane0 |
6465 // | |
6466 // |__________________|
6467 // |******************|
6468 // | cross_plane_pad |
6469 // |******************|
6470 // | |
6471 // | plane1 |
6472 // | |
6473 // |__________________|
6474
6475 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
// One lane per tile row: zin.s0..zin.s3 hold the plane index of rows 0..3, clamped to the last plane (DEPTH_GEMM3D - 1)
6476 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6477 zin = min(DEPTH_GEMM3D - 1, zin);
6478
6479 // Add offset due to the cross plane paddings (zin becomes a per-row byte offset: plane index * padding rows * row stride)
6480 zin *= (src_cross_plane_pad * src0_stride_y);
6481
6482 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6483 // multiply src0_stride_z by DEPTH_GEMM3D
6484 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
6485
6486#else // defined(REINTERPRET_INPUT_AS_3D)
6487
Gian Marcoae2af742018-02-15 12:35:44 +00006488 // Add offset for batched GEMM
6489 src_addr.s0 += get_global_id(2) * src0_stride_z;
6490
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006491#endif // defined(REINTERPRET_INPUT_AS_3D)
6492
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006493#if defined(MATRIX_B_DEPTH)
6494 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
// (the batch index wraps modulo MATRIX_B_DEPTH so that it stays inside matrix B)
6495 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6496#else // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006497 src_addr.s1 += get_global_id(2) * src1_stride_z;
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006498#endif // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006499
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006500 // Initialize accumulators (one float2 per output row of the tile)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006501 float2 acc0 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006502#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006503 float2 acc1 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006504#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6505#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006506 float2 acc2 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006507#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6508#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006509 float2 acc3 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006510#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6511
6512 // A and B src indices get incremented at the same time.
// Main loop: each iteration consumes 8 consecutive elements of a row of A and the 8 matching rows of B
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006513 int i = 0;
6514 for(; i <= ((int)COLS_A - 8); i += 8)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006515 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006516#if defined(REINTERPRET_INPUT_AS_3D)
6517 // Load values from matrix A (row 0 of the tile, shifted by its cross-plane padding offset)
6518 float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
6519#else // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006520 // Load values from matrix A (row 0 of the tile)
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006521 float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006522#endif // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006523
6524 // Load values from matrix B: 8 consecutive rows, 2 columns each; b0..b7 are reused for every row of the tile
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006525 float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6526 src_addr.s1 += src1_stride_y;
6527 float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6528 src_addr.s1 += src1_stride_y;
6529 float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6530 src_addr.s1 += src1_stride_y;
6531 float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6532 src_addr.s1 += src1_stride_y;
6533 float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6534 src_addr.s1 += src1_stride_y;
6535 float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6536 src_addr.s1 += src1_stride_y;
6537 float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6538 src_addr.s1 += src1_stride_y;
6539 float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6540 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006541
6542 // Multiply and accumulate (row 0, column 0 then column 1)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006543 acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
6544 acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
6545 acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
6546 acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
6547 acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
6548 acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
6549 acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
6550 acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006551
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006552 acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
6553 acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
6554 acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
6555 acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
6556 acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
6557 acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
6558 acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
6559 acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006560
6561#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
// Row 1 of the tile: a0 is overwritten with the next row of A
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006562#if defined(REINTERPRET_INPUT_AS_3D)
6563 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6564#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006565 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006566#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006567 acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
6568 acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
6569 acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
6570 acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
6571 acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
6572 acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
6573 acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
6574 acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006575
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006576 acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
6577 acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
6578 acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
6579 acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
6580 acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
6581 acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
6582 acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
6583 acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006584#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6585#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
// Row 2 of the tile
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006586#if defined(REINTERPRET_INPUT_AS_3D)
6587 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6588#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006589 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006590#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006591 acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
6592 acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
6593 acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
6594 acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
6595 acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
6596 acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
6597 acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
6598 acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006599
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006600 acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
6601 acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
6602 acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
6603 acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
6604 acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
6605 acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
6606 acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
6607 acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006608#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6609#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
// Row 3 of the tile
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006610#if defined(REINTERPRET_INPUT_AS_3D)
6611 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6612#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006613 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006614#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006615 acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
6616 acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
6617 acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
6618 acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
6619 acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
6620 acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
6621 acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
6622 acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006623
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006624 acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
6625 acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
6626 acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
6627 acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
6628 acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
6629 acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
6630 acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
6631 acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006632#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006633
// Advance matrix A by the 8 elements consumed in this iteration
6634 src_addr.s0 += sizeof(float) * 8;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006635 }
6636 // Leftover loop: process the elements of COLS_A not covered by the unrolled loop, one float at a time
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006637 for(; i < (int)COLS_A; ++i)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006638 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006639#if defined(REINTERPRET_INPUT_AS_3D)
6640 // Load values from matrix A (one scalar per tile row, each shifted by its cross-plane padding offset)
6641 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
6642#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6643 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6644#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6645#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6646 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6647#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6648#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6649 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6650#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6651#else // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006652 // Load values from matrix A (one scalar per tile row)
6653 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
6654#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6655 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6656#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6657#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6658 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6659#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6660#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6661 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6662#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006663#endif // defined(REINTERPRET_INPUT_AS_3D)
6664
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006665 // Load values from matrix B (one row, 2 columns)
6666 float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006667 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006668
6669 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006670 acc0.s0 = fma(a0, b0.s0, acc0.s0);
6671 acc0.s1 = fma(a0, b0.s1, acc0.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006672#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006673 acc1.s0 = fma(a1, b0.s0, acc1.s0);
6674 acc1.s1 = fma(a1, b0.s1, acc1.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006675#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6676#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006677 acc2.s0 = fma(a2, b0.s0, acc2.s0);
6678 acc2.s1 = fma(a2, b0.s1, acc2.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006679#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6680#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006681 acc3.s0 = fma(a3, b0.s0, acc3.s0);
6682 acc3.s1 = fma(a3, b0.s1, acc3.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006683#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006684
6685 src_addr.s0 += sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006686 }
6687
// Batch index of this work-item
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006688 int z = get_global_id(2);
6689
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006690 // Compute destination address
6691 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
6692
Gian Marcoae2af742018-02-15 12:35:44 +00006693 // Compute dst address
// NOTE(review): presumably offset(&dst, 0, 0) yields the first element of this work-item's output tile - confirm against the helper's definition
6694 __global uchar *dst_addr = offset(&dst, 0, 0);
6695
// Per-row extra byte offset applied when storing; stays 0 unless REINTERPRET_OUTPUT_AS_3D is defined
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006696 uint4 zout = 0;
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00006697
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006698#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006699
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006700 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006701 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006702 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006703 // | |
6704 // | plane0 |
6705 // | |
6706 // |__________________|
6707 // |******************|
6708 // | cross_plane_pad |
6709 // |******************|
6710 // | |
6711 // | plane1 |
6712 // | |
6713 // |__________________|
Gian Marcoae2af742018-02-15 12:35:44 +00006714
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006715 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006716 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6717 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006718
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006719 // Add offset due to the cross plane paddings
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006720 zout *= (dst_cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006721
6722 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6723 // multiply dst_stride_z by DEPTH_GEMM3D
6724 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006725#else // defined(REINTERPRET_OUTPUT_AS_3D)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006726 // Add offset for batched GEMM
6727 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006728#endif // defined(REINTERPRET_OUTPUT_AS_3D)
6729
6730 // Multiply by the weight of matrix-matrix product and store the result
6731#if defined(ALPHA)
6732 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
6733#endif // defined(ALPHA)
6734
6735 // Add beta*bias
6736#if defined(BETA)
// NOTE(review): "zero" is passed in the last argument of LOAD_BLOCK below, presumably as per-row z offsets of 0 for the bias - confirm against the macro definition
6737 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
6738
6739#if defined(BROADCAST_BIAS)
// Broadcast bias: the address depends only on the column position (2 floats per work-item), not on the row or batch
6740 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
6741
6742 LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
6743
6744#ifndef UNIT_BETA
6745 SCALE_BLOCK(1, float, bias, BETA);
6746#endif // UNIT_BETA
6747
6748 // acc = acc + bias[broadcasted]
6749 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
6750
6751#else // defined(BROADCAST_BIAS)
// Full bias matrix: address the same column, row block and batch as the output tile
6752 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *
6753 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
6754
6755 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
6756
6757#ifndef UNIT_BETA
6758 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
6759#endif // UNIT_BETA
6760
6761 // acc = acc + bias
6762 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
6763
6764#endif // defined(BROADCAST_BIAS)
6765#endif // defined(BETA)
6766
6767#if defined(ACTIVATION_TYPE)
// NOTE(review): VEC_SIZE is not referenced anywhere else in this kernel, while the accumulators are float2 - confirm it is defined as 2 when this kernel is built
Giorgio Arenad056e572020-10-12 11:53:51 +01006768 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL);
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006769#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006770
6771 // Store the output block (one float2 per row; zout adds the cross-plane padding offset of each row)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006772 vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006773#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006774 vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006775#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6776#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006777 vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006778#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6779#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006780 vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006781#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006782}
6783
Vidhya Sudhan Loganathanbdff4912018-05-22 15:03:09 +01006784#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01006785/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
6786 *
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006787 * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
6788 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
6789 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
6790 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
6791 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006792 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
6793 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006794 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006795 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
6796 * The activation function is performed after the bias addition
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006797 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
6798 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
6799 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
6800 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
6801 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
6802 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
6803 *
6804 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
6805 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
6806 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
6807 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
6808 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
6809 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
6810 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
6811 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
6812 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
6813 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
6814 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
6815 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006816 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
6817 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
6818 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
6819 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
6820 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
6821 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006822 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
6823 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
6824 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
6825 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
6826 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
6827 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
6828 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
6829 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006830 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006831 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
6832 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
6833 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
6834 */
6835__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
6836 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006837#if defined(BETA)
6838 IMAGE_DECLARATION(src2),
6839#endif // defined(BETA)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006840 IMAGE_DECLARATION(dst),
6841 uint src0_stride_z,
6842 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006843#if defined(BETA)
6844 uint src2_stride_z,
6845#endif //defined(BETA)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006846 uint dst_stride_z
6847#if defined(REINTERPRET_INPUT_AS_3D)
6848 ,
6849 uint src_cross_plane_pad
6850#endif // REINTERPRET_INPUT_AS_3D
6851#if defined(REINTERPRET_OUTPUT_AS_3D)
6852 ,
6853 uint dst_cross_plane_pad
6854#endif // REINTERPRET_OUTPUT_AS_3D
6855 )
6856{
6857 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
6858
6859 // Compute starting address for matrix A and Matrix B
6860 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
6861
6862 // Update address for the matrix A
6863 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
6864
6865 // Update address for the matrix B
6866 src_addr.s1 += idx * sizeof(half);
6867
6868#if defined(REINTERPRET_INPUT_AS_3D)
6869 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
6870 // in order to take into account the presence of possible cross plane paddings
6871 //
6872 // | |
6873 // | plane0 |
6874 // | |
6875 // |__________________|
6876 // |******************|
6877 // | cross_plane_pad |
6878 // |******************|
6879 // | |
6880 // | plane1 |
6881 // | |
6882 // |__________________|
6883
6884 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
6885 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6886 zin = min(DEPTH_GEMM3D - 1, zin);
6887
6888 // Add offset due to the cross plane paddings
6889 zin *= (src_cross_plane_pad * src0_stride_y);
6890
6891 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6892 // multiply src0_stride_z by DEPTH_GEMM3D
6893 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
6894
6895#else // defined(REINTERPRET_INPUT_AS_3D)
6896
6897 // Add offset for batched GEMM
6898 src_addr.s0 += get_global_id(2) * src0_stride_z;
6899
6900#endif // defined(REINTERPRET_INPUT_AS_3D)
6901
6902#if defined(MATRIX_B_DEPTH)
6903 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
6904 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6905#else // defined(MATRIX_B_DEPTH)
6906 src_addr.s1 += get_global_id(2) * src1_stride_z;
6907#endif // defined(MATRIX_B_DEPTH)
6908
6909 float8 acc0 = 0.0h;
6910#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6911 float8 acc1 = 0.0h;
6912#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6913#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6914 float8 acc2 = 0.0h;
6915#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6916#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6917 float8 acc3 = 0.0h;
6918#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6919
6920 int i = 0;
6921 for(; i <= ((int)COLS_A - 4); i += 4)
6922 {
6923#if defined(REINTERPRET_INPUT_AS_3D)
6924 // Load values from matrix A
Usama Arif0681e3b2019-04-25 14:28:07 +01006925 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
6926#else // defined(REINTERPRET_INPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006927 // Load values from matrix A
6928 half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
6929#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6930 half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6931#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6932#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6933 half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6934#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6935#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6936 half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6937#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6938#endif // defined(REINTERPRET_INPUT_AS_3D)
6939
6940 // Load values from matrix B
6941 float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6942 src_addr.s1 += src1_stride_y;
6943
6944 // Accumulate
6945 acc0 = fma(b0, (float8)a0.s0, acc0);
6946#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6947 acc1 = fma(b0, (float8)a1.s0, acc1);
6948#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6949#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6950 acc2 = fma(b0, (float8)a2.s0, acc2);
6951#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6952#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6953 acc3 = fma(b0, (float8)a3.s0, acc3);
6954#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6955
6956 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6957 src_addr.s1 += src1_stride_y;
6958 acc0 = fma(b0, (float8)a0.s1, acc0);
6959#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6960 acc1 = fma(b0, (float8)a1.s1, acc1);
6961#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6962#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6963 acc2 = fma(b0, (float8)a2.s1, acc2);
6964#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6965#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6966 acc3 = fma(b0, (float8)a3.s1, acc3);
6967#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6968
6969 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6970 src_addr.s1 += src1_stride_y;
6971 acc0 = fma(b0, (float8)a0.s2, acc0);
6972#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6973 acc1 = fma(b0, (float8)a1.s2, acc1);
6974#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6975#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6976 acc2 = fma(b0, (float8)a2.s2, acc2);
6977#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6978#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6979 acc3 = fma(b0, (float8)a3.s2, acc3);
6980#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6981
6982 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6983 src_addr.s1 += src1_stride_y;
6984 acc0 = fma(b0, (float8)a0.s3, acc0);
6985#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6986 acc1 = fma(b0, (float8)a1.s3, acc1);
6987#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6988#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6989 acc2 = fma(b0, (float8)a2.s3, acc2);
6990#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6991#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6992 acc3 = fma(b0, (float8)a3.s3, acc3);
6993#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6994
6995 src_addr.s0 += 4 * sizeof(half);
6996 }
6997
6998 for(; i < (int)COLS_A; ++i)
6999 {
7000#if defined(REINTERPRET_INPUT_AS_3D)
7001 // Load values from matrix A
7002 half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
7003#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7004 half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
7005#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7006#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7007 half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
7008#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7009#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7010 half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
7011#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7012#else // defined(REINTERPRET_INPUT_AS_3D)
7013 // Load values from matrix A
7014 half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
7015#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7016 half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
7017#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7018#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7019 half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
7020#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7021#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7022 half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
7023#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7024#endif // defined(REINTERPRET_INPUT_AS_3D)
7025
7026 // Load values from matrix B
7027 float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
7028
7029 src_addr += (int2)(sizeof(half), src1_stride_y);
7030
7031 // Accumulate
7032 acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;
7033#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7034 acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;
7035#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7036#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7037 acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;
7038#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7039#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7040 acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;
7041#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7042 }
7043
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007044 int z = get_global_id(2);
7045
7046 // Compute destination address
7047 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
7048
7049 // Compute dst address
7050 __global uchar *dst_addr = offset(&dst, 0, 0);
7051
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007052 uint4 zout = 0;
7053
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007054#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007055
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007056 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
7057 // in order to take into account the presence of possible cross plane paddings
7058 //
7059 // | |
7060 // | plane0 |
7061 // | |
7062 // |__________________|
7063 // |******************|
7064 // | cross_plane_pad |
7065 // |******************|
7066 // | |
7067 // | plane1 |
7068 // | |
7069 // |__________________|
7070
7071 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007072 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
7073 zout = min(DEPTH_GEMM3D - 1, zout);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007074
7075 // Add offset due to the cross plane paddings
7076 zout *= (dst_cross_plane_pad * dst_stride_y);
7077
7078 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
7079 // multiply dst_stride_z by DEPTH_GEMM3D
7080 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007081#else // defined(REINTERPRET_OUTPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007082 // Add offset for batched GEMM
7083 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007084#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007085
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007086 // Multiply by the weight of matrix-matrix product and store the result
7087#if defined(ALPHA)
7088 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
7089#endif // defined(ALPHA)
7090
7091#if defined(BETA)
7092 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
7093
7094#if defined(BROADCAST_BIAS)
7095 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
7096
7097 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
7098
7099 float8 bias_f0 = convert_float8(bias0);
7100
7101#ifndef UNIT_BETA
7102 SCALE_BLOCK(1, float, bias_f, BETA);
7103#endif // UNIT_BIAS
7104
7105 // acc = acc + bias[broadcasted]
7106 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);
7107
7108#else // defined(BROADCAST_BIAS)
7109 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
7110 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
7111
7112 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
7113
7114 float8 bias_f0 = convert_float8(bias0);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007115#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007116 float8 bias_f1 = convert_float8(bias1);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007117#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7118#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007119 float8 bias_f2 = convert_float8(bias2);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007120#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7121#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007122 float8 bias_f3 = convert_float8(bias3);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007123#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007124
7125#ifndef UNIT_BETA
7126 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);
7127#endif // UNIT_BIAS
7128
7129 // acc = acc + bias
7130 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);
7131
7132#endif // defined(BROADCAST_BIAS)
7133#endif // defined(BETA)
7134
7135 half8 acc_h0 = convert_half8(acc0);
7136#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7137 half8 acc_h1 = convert_half8(acc1);
7138#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7139#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7140 half8 acc_h2 = convert_half8(acc2);
7141#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7142#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7143 half8 acc_h3 = convert_half8(acc3);
7144#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7145
7146#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01007147 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, VEC_SIZE, acc_h, A_VAL, B_VAL);
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007148#endif // defined(ACTIVATION_TYPE)
7149
7150 // Store the output block
7151 STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007152}
7153
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units. The accumulators are kept in half precision.
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
 *       The kernel body handles up to 4 rows per work-item (accumulators acc0..acc3 are enabled by NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1, > 2, > 3).
 *       NOTE(review): each work-item always loads 8 values of matrix B per row (vload8), stores 8 values per output row and offsets the bias by
 *       get_global_id(0) * 8 elements, so -DNUM_ELEMS_PROCESSED_PER_THREAD_X is presumably expected to be 8 - TODO confirm against the host-side configuration.
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
 * @note The optional bias (src2) is enabled at compile time using -DBETA=beta. The bias is scaled by BETA unless -DUNIT_BETA is defined.
 *       If -DBROADCAST_BIAS is defined, a single bias row is loaded and added to every output row computed by the work-item.
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                 IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
#if defined(BETA)
                                                 uint src2_stride_z,
#endif //defined(BETA)
                                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                 ,
                                                 uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // First column (in elements) of matrix B handled by this work-item
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B (s0: byte offset in matrix A, s1: byte offset in matrix B)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx * sizeof(half);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zin       = min(DEPTH_GEMM3D - 1, zin);

    // Add offset due to the cross plane paddings
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // One half8 accumulator per output row computed by this work-item
    half8 acc0 = 0.0h;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    half8 acc1 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    half8 acc2 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    half8 acc3 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consume 4 columns of matrix A (and the matching 4 rows of matrix B) per iteration
    int i = 0;
    for(; i <= ((int)COLS_A - 4); i += 4)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A (the macro provides the a0..a3 row vectors used below, applying the per-row zin offsets)
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B (row i + 0) and move to the next row
        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;

        // Accumulate: acc += b0 * a.s0 (the scalar of matrix A is broadcast over the 8 lanes)
        acc0 = fma(b0, (half8)a0.s0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s0, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s0, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s0, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Row i + 1 of matrix B
        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s1, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s1, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s1, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Row i + 2 of matrix B
        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s2, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s2, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s2, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Row i + 3 of matrix B
        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s3, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s3, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s3, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Move 4 columns forward in matrix A
        src_addr.s0 += 4 * sizeof(half);
    }

    // Left-over loop: process the remaining (COLS_A % 4) columns of matrix A one at a time
    for(; i < (int)COLS_A; ++i)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

        // Advance matrix A by one column and matrix B by one row
        src_addr += (int2)(sizeof(half), src1_stride_y);

        // Accumulate: acc += b0 * a
        acc0 = fma(b0, (half8)a0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Batch index
    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra offset passed to STORE_BLOCK; stays 0 unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    // zero0..zeroN-1: null z-offsets handed to LOAD_BLOCK when reading the bias
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // Only the x position is used: the same bias row is shared by every output row
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else // defined(BROADCAST_BIAS)
    // The bias has one row per output row and is batched along z
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, VEC_SIZE, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the output block
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
}
Vidhya Sudhan Loganathanbdff4912018-05-22 15:03:09 +01007499#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007500
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01007501#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007502
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007503#if defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007504/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
7505 *
 * @note The value of beta needs to be passed at compile time using -DBETA
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007507 *
7508 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
7509 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
7510 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
7511 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
7512 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007513 * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in bytes)
7514 * @param[in] src_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007515 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01007516 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007517 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
7518 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
7519 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
7520 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007521 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
7522 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007523 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
7524 */
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007525__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
7526 TENSOR3D_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007527{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007528 // Compute source and destination addresses
Isabella Gottardi8e74f442018-03-01 16:42:00 +00007529 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
7530 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007531
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007532 // Load values from A x B
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007533 float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
7534
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007535 // Load values from Matrix C
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007536 float4 c = vload4(0, (__global float *)src.ptr);
7537
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007538 // Computes alpha * axb + beta * c
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007539 float4 out = alpha_ab + (float4)BETA * c;
7540
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007541 // Store final result in axb matrix
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007542 vstore4(out, 0, (__global float *)dst.ptr);
7543}
7544
Vidhya Sudhan Loganathan76c85642018-05-25 13:53:02 +01007545#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** In-place matrix addition for F16 data: dst = dst + BETA * src
 *
 * Each work-item reads eight consecutive halfs from the destination (the A x B values already stored there) and
 * eight consecutive halfs from the source (matrix C), scales the source values by BETA and writes the sum back
 * to the same destination location. No alpha scaling is applied by this kernel.
 *
 * @note The value of beta must be passed at compile time using -DBETA
 *
 * @param[in]     src_ptr                           Pointer to the source matrix (matrix C). Supported data types: F16
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]     src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix, read and then overwritten. Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]     dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of matrix C (src) and of the accumulated A x B matrix (dst)
    Tensor3D mtx_c  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D mtx_ab = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global half *c_values  = (__global half *)mtx_c.ptr;
    __global half *ab_values = (__global half *)mtx_ab.ptr;

    // Fetch eight elements of matrix C and the matching eight elements of A x B
    half8 c_vec  = vload8(0, c_values);
    half8 ab_vec = vload8(0, ab_values);

    // A x B + beta * C, written back over the A x B values
    half8 result = ab_vec + (half8)BETA * c_vec;

    vstore8(result, 0, ab_values);
}
Vidhya Sudhan Loganathan76c85642018-05-25 13:53:02 +01007586#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007587#endif // defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007588
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007589#if defined(WIDTH_VECTOR_A)
/** Vector-by-matrix multiplication used for the locally connected layer (F32)
 *
 * The work-item with global id (x, y) multiplies row y of A (src0) by the matrix stored in Z-slice y of B (src1)
 * and produces the four consecutive output values starting at column 4 * x.
 * The elements of A are consumed two at a time; a scalar tail loop handles a trailing odd element.
 *
 * @note The width of A must be passed at compile time using -DWIDTH_VECTOR_A
 *
 * @note The input A and matrix B must not be reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix A. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix B. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    int col = get_global_id(0) * 4;
    int row = get_global_id(1);

    // Byte offset of the current element of vector A (row "row" of src0)
    int a_off = src0_offset_first_element_in_bytes + src0_stride_y * row;

    // Byte offset of the current 4-wide slice of matrix B (Z-slice "row" of src1, starting at column "col")
    int b_off = src1_offset_first_element_in_bytes + src1_stride_z * row;
    b_off += col * sizeof(float);

    // Byte offset one past the last element of vector A
    int a_end = a_off + (WIDTH_VECTOR_A * sizeof(float));

    float4 sum = 0.0f;

    // Main loop: two elements of A against two consecutive rows of B per iteration
    while(a_off <= (a_end - 2 * (int)sizeof(float)))
    {
        float2 a_pair = vload2(0, (__global float *)(src0_ptr + a_off));
        float4 b_row0 = vload4(0, (__global float *)(src1_ptr + b_off));
        float4 b_row1 = vload4(0, (__global float *)(src1_ptr + b_off + src1_stride_y));

        sum += b_row0 * (float4)a_pair.s0;
        sum += b_row1 * (float4)a_pair.s1;

        a_off += (int)(2 * sizeof(float));
        b_off += (int)(2 * src1_stride_y);
    }

    // Tail loop: remaining element of A (if any), one row of B per iteration
    while(a_off < a_end)
    {
        float  a_val = *((__global float *)(src0_ptr + a_off));
        float4 b_row = vload4(0, (__global float *)(src1_ptr + b_off));

        sum += b_row * (float4)a_val;

        a_off += (int)sizeof(float);
        b_off += (int)src1_stride_y;
    }

    // Resolve the destination address for this work-item and write the four results
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore4(sum, 0, (__global float *)(offset(&dst_img, 0, 0)));
}
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007655#endif // defined(WIDTH_VECTOR_A)